Merge remote-tracking branch 'upstream/develop' into chore/develop-into-master-v3.1

This commit is contained in:
Adriane Boyd 2021-06-15 15:05:17 +02:00
commit 5646fcbe46
51 changed files with 868 additions and 188 deletions

View File

@ -8,3 +8,4 @@ recursive-exclude spacy/lang *.json
recursive-include spacy/lang *.json.gz
recursive-include spacy/cli *.json *.yml
recursive-include licenses *
recursive-exclude spacy *.cpp

View File

@ -43,8 +43,8 @@ scikit-learn
* Files: scorer.py
The following implementation of roc_auc_score() is adapted from
scikit-learn, which is distributed under the following license:
The implementation of roc_auc_score() is adapted from scikit-learn, which is
distributed under the following license:
New BSD License
@ -77,3 +77,30 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.
pyvi
----
* Files: lang/vi/__init__.py
The MIT License (MIT)
Copyright (c) 2016 Viet-Trung Tran
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -68,7 +68,7 @@ console_scripts =
[options.extras_require]
lookups =
spacy_lookups_data>=1.0.0,<1.1.0
spacy_lookups_data>=1.0.1,<1.1.0
transformers =
spacy_transformers>=1.0.1,<1.1.0
ray =

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.0.6"
__version__ = "3.1.0.dev0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@ -115,7 +115,8 @@ def convert(
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
doc_files = []
for input_loc in walk_directory(Path(input_path), converter):
input_data = input_loc.open("r", encoding="utf-8").read()
with input_loc.open("r", encoding="utf-8") as infile:
input_data = infile.read()
# Use converter function to convert data
func = CONVERTERS[converter]
docs = func(

View File

@ -112,7 +112,9 @@ def package(
msg.fail("Invalid pipeline meta.json")
print("\n".join(errors))
sys.exit(1)
model_name = meta["lang"] + "_" + meta["name"]
model_name = meta["name"]
if not model_name.startswith(meta["lang"] + "_"):
model_name = f"{meta['lang']}_{model_name}"
model_name_v = model_name + "-" + meta["version"]
main_path = output_dir / model_name_v
package_path = main_path / model_name
@ -128,9 +130,10 @@ def package(
)
Path.mkdir(package_path, parents=True)
shutil.copytree(str(input_dir), str(package_path / model_name_v))
license_path = package_path / model_name_v / "LICENSE"
if license_path.exists():
shutil.move(str(license_path), str(main_path))
for file_name in FILENAMES_DOCS:
file_path = package_path / model_name_v / file_name
if file_path.exists():
shutil.move(str(file_path), str(main_path))
imports = []
for code_path in code_paths:
imports.append(code_path.stem)
@ -294,7 +297,7 @@ def setup_package():
if __name__ == '__main__':
setup_package()
""".strip()
""".lstrip()
TEMPLATE_MANIFEST = """
@ -314,4 +317,7 @@ __version__ = get_model_meta(Path(__file__).parent)['version']
def load(**overrides):
return load_model_from_init_py(__file__, **overrides)
""".strip()
""".lstrip()
FILENAMES_DOCS = ["LICENSE", "LICENSES_SOURCES", "README.md"]

View File

@ -372,7 +372,7 @@ factory = "{{ pipe }}"
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = {{ 500 if hardware == "gpu" else 2000 }}
max_length = 0
[corpora.dev]
@readers = "spacy.Corpus.v1"

View File

@ -80,6 +80,8 @@ eval_frequency = 200
score_weights = {}
# Names of pipeline components that shouldn't be updated during training
frozen_components = []
# Names of pipeline components that should set annotations during training
annotating_components = []
# Location in the config where the dev corpus is defined
dev_corpus = "corpora.dev"
# Location in the config where the train corpus is defined

View File

@ -24,6 +24,9 @@ def setup_default_warnings():
for pipe in ["matcher", "entity_ruler"]:
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
# warn once about lemmatizer without required POS
filter_warning("once", error_msg="[W108]")
def filter_warning(action: str, error_msg: str):
"""Customize how spaCy should handle a certain warning.

View File

@ -28,7 +28,7 @@ cdef class Candidate:
cdef class KnowledgeBase:
cdef Pool mem
cpdef readonly Vocab vocab
cdef readonly Vocab vocab
cdef int64_t entity_vector_length
# This maps 64bit keys (hash of unique entity string)

View File

@ -1,15 +1,23 @@
from typing import Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from .lemmatizer import CatalanLemmatizer
class CatalanDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
class Catalan(Language):
@ -17,4 +25,16 @@ class Catalan(Language):
Defaults = CatalanDefaults
@Catalan.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
):
return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
__all__ = ["Catalan"]

View File

@ -0,0 +1,81 @@
from typing import List, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
class CatalanLemmatizer(Lemmatizer):
"""
Copied from French Lemmatizer
Catalan language lemmatizer applies the default rule based lemmatization
procedure with some modifications for better Catalan language support.
The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use
the rule-based lemmatization. As a last resort, the lemmatizer checks in
the lookup table.
"""
@classmethod
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
if mode == "rule":
required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
return (required, [])
else:
return super().get_lookups_config(mode)
def rule_lemmatize(self, token: Token) -> List[str]:
cache_key = (token.orth, token.pos)
if cache_key in self.cache:
return self.cache[cache_key]
string = token.text
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
return [string.lower()]
elif "lemma_rules" not in self.lookups or univ_pos not in (
"noun",
"verb",
"adj",
"adp",
"adv",
"aux",
"cconj",
"det",
"pron",
"punct",
"sconj",
):
return self.lookup_lemmatize(token)
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
lookup_table = self.lookups.get_table("lemma_lookup", {})
index = index_table.get(univ_pos, {})
exceptions = exc_table.get(univ_pos, {})
rules = rules_table.get(univ_pos, [])
string = string.lower()
forms = []
if string in index:
forms.append(string)
self.cache[cache_key] = forms
return forms
forms.extend(exceptions.get(string, []))
oov_forms = []
if not forms:
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup_table.keys():
forms.append(self.lookup_lemmatize(token)[0])
if not forms:
forms.append(string)
forms = list(set(forms))
self.cache[cache_key] = forms
return forms

View File

@ -1,12 +1,46 @@
from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import CURRENCY
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
from ..char_classes import merge_chars, _units
ELISION = " ' ".strip().replace(" ", "").replace("\n", "")
_infixes = TOKENIZER_INFIXES + [
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
]
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
]
)
_units = _units.replace("% ", "")
UNITS = merge_chars(_units)
_suffixes = (
LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_ICONS
+ [r"-", "", ""]
+ [
r"(?<=[0-9])\+",
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
),
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
]
)
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes

View File

@ -0,0 +1,46 @@
from ...symbols import NOUN, PROPN
from ...errors import Errors
def noun_chunks(doclike):
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# fmt: off
labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels]
np_label = doc.vocab.strings.add("NP")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN):
continue
# Prevent nested chunks from being produced
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
left = word.left_edge.i
right = word.right_edge.i + 1
# leave prepositions and punctuation out of the left side of the chunk
if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT":
left = word.left_edge.i + 1
prev_end = word.right_edge.i
# leave subordinated clauses and appositions out of the chunk
a = word.i + 1
while a < word.right_edge.i:
paraula = doc[a]
if paraula.pos_ == "VERB":
right = paraula.left_edge.i
prev_end = paraula.left_edge.i - 1
elif paraula.dep_ == "appos":
right = paraula.left_edge.i + 1
prev_end = paraula.left_edge.i - 1
a += 1
# leave punctuation out of the right side of the chunk
if word.right_edge.pos_ == "PUNCT":
right = right - 1
yield left, right, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

View File

@ -24,6 +24,13 @@ for exc_data in [
{ORTH: "núm", NORM: "número"},
{ORTH: "St.", NORM: "sant"},
{ORTH: "Sta.", NORM: "santa"},
{ORTH: "'l"},
{ORTH: "'ls"},
{ORTH: "'m"},
{ORTH: "'n"},
{ORTH: "'ns"},
{ORTH: "'s"},
{ORTH: "'t"},
]:
_exc[exc_data[ORTH]] = [exc_data]

View File

@ -72,7 +72,7 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua
subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
sullo suo suoi
tale tali talvolta tanto te tempo ti titolo torino tra tranne tre trenta
tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto
uguali ulteriore ultimo un una uno uomo

View File

@ -1,8 +1,15 @@
from typing import Any, Dict, Union
from pathlib import Path
import re
import srsly
import string
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer, registry, load_config_from_str
from ... import util
DEFAULT_CONFIG = """
@ -40,17 +47,108 @@ class VietnameseTokenizer(DummyTokenizer):
def __call__(self, text: str) -> Doc:
if self.use_pyvi:
words, spaces = self.ViTokenizer.spacy_tokenize(text)
words = self.pyvi_tokenize(text)
words, spaces = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
else:
words = []
spaces = []
for token in self.tokenizer(text):
words.extend(list(token.text))
spaces.extend([False] * len(token.text))
spaces[-1] = bool(token.whitespace_)
words, spaces = util.get_words_and_spaces(text.split(), text)
return Doc(self.vocab, words=words, spaces=spaces)
# The methods pyvi_sylabelize_with_ws and pyvi_tokenize are adapted from
# pyvi v0.1, MIT License, Copyright (c) 2016 Viet-Trung Tran.
# See licenses/3rd_party_licenses.txt
def pyvi_sylabelize_with_ws(self, text):
"""Modified from pyvi to preserve whitespace and skip unicode
normalization."""
specials = [r"==>", r"->", r"\.\.\.", r">>"]
digit = r"\d+([\.,_]\d+)+"
email = r"([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)"
web = r"\w+://[^\s]+"
word = r"\w+"
non_word = r"[^\w\s]"
abbreviations = [
r"[A-ZĐ]+\.",
r"Tp\.",
r"Mr\.",
r"Mrs\.",
r"Ms\.",
r"Dr\.",
r"ThS\.",
]
patterns = []
patterns.extend(abbreviations)
patterns.extend(specials)
patterns.extend([web, email])
patterns.extend([digit, non_word, word])
patterns = r"(\s+|" + "|".join(patterns) + ")"
tokens = re.findall(patterns, text, re.UNICODE)
return [token[0] for token in tokens]
def pyvi_tokenize(self, text):
"""Modified from pyvi to preserve text and whitespace."""
if len(text) == 0:
return []
elif text.isspace():
return [text]
segs = self.pyvi_sylabelize_with_ws(text)
words = []
preceding_ws = []
for i, token in enumerate(segs):
if not token.isspace():
words.append(token)
preceding_ws.append(
"" if (i == 0 or not segs[i - 1].isspace()) else segs[i - 1]
)
labels = self.ViTokenizer.ViTokenizer.model.predict(
[self.ViTokenizer.ViTokenizer.sent2features(words, False)]
)
token = words[0]
tokens = []
for i in range(1, len(labels[0])):
if (
labels[0][i] == "I_W"
and words[i] not in string.punctuation
and words[i - 1] not in string.punctuation
and not words[i][0].isdigit()
and not words[i - 1][0].isdigit()
and not (words[i][0].istitle() and not words[i - 1][0].istitle())
):
token = token + preceding_ws[i] + words[i]
else:
tokens.append(token)
token = words[i]
tokens.append(token)
return tokens
def _get_config(self) -> Dict[str, Any]:
return {"use_pyvi": self.use_pyvi}
def _set_config(self, config: Dict[str, Any] = {}) -> None:
self.use_pyvi = config.get("use_pyvi", False)
def to_bytes(self, **kwargs) -> bytes:
serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
return util.to_bytes(serializers, [])
def from_bytes(self, data: bytes, **kwargs) -> "VietnameseTokenizer":
deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
util.from_bytes(data, deserializers, [])
return self
def to_disk(self, path: Union[str, Path], **kwargs) -> None:
path = util.ensure_path(path)
serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
return util.to_disk(path, serializers, [])
def from_disk(self, path: Union[str, Path], **kwargs) -> "VietnameseTokenizer":
path = util.ensure_path(path)
serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
util.from_disk(path, serializers, [])
return self
class VietnameseDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)

View File

@ -690,7 +690,7 @@ class Language:
if self.vocab.vectors.shape != source.vocab.vectors.shape or \
self.vocab.vectors.key2row != source.vocab.vectors.key2row or \
self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes():
util.logger.warning(Warnings.W113.format(name=source_name))
warnings.warn(Warnings.W113.format(name=source_name))
if not source_name in source.component_names:
raise KeyError(
Errors.E944.format(
@ -1075,6 +1075,7 @@ class Language:
losses: Optional[Dict[str, float]] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
exclude: Iterable[str] = SimpleFrozenList(),
annotates: Iterable[str] = SimpleFrozenList(),
):
"""Update the models in the pipeline.
@ -1082,10 +1083,13 @@ class Language:
_: Should not be set - serves to catch backwards-incompatible scripts.
drop (float): The dropout rate.
sgd (Optimizer): An optimizer.
losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
losses (Dict[str, float]): Dictionary to update with the loss, keyed by
component.
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
components, keyed by component name.
exclude (Iterable[str]): Names of components that shouldn't be updated.
annotates (Iterable[str]): Names of components that should set
annotations on the predicted examples after updating.
RETURNS (Dict[str, float]): The updated losses dictionary
DOCS: https://spacy.io/api/language#update
@ -1104,15 +1108,16 @@ class Language:
sgd = self._optimizer
if component_cfg is None:
component_cfg = {}
pipe_kwargs = {}
for i, (name, proc) in enumerate(self.pipeline):
component_cfg.setdefault(name, {})
pipe_kwargs[name] = deepcopy(component_cfg[name])
component_cfg[name].setdefault("drop", drop)
pipe_kwargs[name].setdefault("batch_size", self.batch_size)
for name, proc in self.pipeline:
if name in exclude or not hasattr(proc, "update"):
continue
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
if sgd not in (None, False):
for name, proc in self.pipeline:
if name not in exclude and hasattr(proc, "update"):
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
if sgd not in (None, False):
if (
name not in exclude
and hasattr(proc, "is_trainable")
@ -1120,6 +1125,18 @@ class Language:
and proc.model not in (True, False, None)
):
proc.finish_update(sgd)
if name in annotates:
for doc, eg in zip(
_pipe(
(eg.predicted for eg in examples),
proc=proc,
name=name,
default_error_handler=self.default_error_handler,
kwargs=pipe_kwargs[name],
),
examples,
):
eg.predicted = doc
return losses
def rehearse(

View File

@ -4,6 +4,7 @@ from collections import defaultdict
from itertools import product
import numpy
import warnings
from .matcher cimport Matcher
from ..vocab cimport Vocab
@ -11,7 +12,6 @@ from ..tokens.doc cimport Doc
from ..errors import Errors, Warnings
from ..tokens import Span
from ..util import logger
DELIMITER = "||"
@ -282,7 +282,7 @@ cdef class DependencyMatcher:
keys_to_position_maps = defaultdict(lambda: defaultdict(list))
for match_id, start, end in self._matcher(doc):
if start + 1 != end:
logger.warning(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0]))
warnings.warn(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0]))
token = doc[start]
root = ([token] + list(token.ancestors))[-1]
keys_to_position_maps[root.i][match_id].append(start)

View File

@ -50,6 +50,8 @@ cdef class PhraseMatcher:
if isinstance(attr, (int, long)):
self.attr = attr
else:
if attr is None:
attr = "ORTH"
attr = attr.upper()
if attr == "TEXT":
attr = "ORTH"

View File

@ -1,14 +1,11 @@
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap, PreshMapArray
from libc.stdint cimport uint64_t
from murmurhash cimport mrmr
from preshed.maps cimport PreshMap
cimport numpy as np
from libc.stdint cimport uint64_t
from .structs cimport TokenC, MorphAnalysisC
from .structs cimport MorphAnalysisC
from .strings cimport StringStore
from .typedefs cimport hash_t, attr_t, flags_t
from .parts_of_speech cimport univ_pos_t
from . cimport symbols
from .typedefs cimport attr_t, hash_t
cdef class Morphology:
@ -16,14 +13,6 @@ cdef class Morphology:
cdef readonly StringStore strings
cdef PreshMap tags # Keyed by hash, value is pointer to tag
cdef public object lemmatizer
cdef readonly object tag_map
cdef readonly object tag_names
cdef readonly object reverse_index
cdef readonly object _exc
cdef readonly PreshMapArray _cache
cdef readonly int n_tags
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
cdef int insert(self, MorphAnalysisC tag) except -1

View File

@ -1,20 +1,11 @@
# cython: infer_types
from libc.string cimport memset
import srsly
from collections import Counter
import numpy
import warnings
from .attrs cimport POS, IS_SPACE
from .parts_of_speech cimport SPACE
from .lexeme cimport Lexeme
from .attrs cimport POS
from .strings import get_string_id
from .attrs import LEMMA, intify_attrs
from .parts_of_speech import IDS as POS_IDS
from .errors import Errors, Warnings
from .util import ensure_path
from .errors import Warnings
from . import symbols

View File

@ -481,7 +481,8 @@ class EntityLinker(TrainablePipe):
def load_model(p):
try:
self.model.from_bytes(p.open("rb").read())
with p.open("rb") as infile:
self.model.from_bytes(infile.read())
except AttributeError:
raise ValueError(Errors.E149) from None

View File

@ -102,17 +102,12 @@ class EntityRuler(Pipe):
self.overwrite = overwrite_ents
self.token_patterns = defaultdict(list)
self.phrase_patterns = defaultdict(list)
self._validate = validate
self.matcher = Matcher(nlp.vocab, validate=validate)
if phrase_matcher_attr is not None:
if phrase_matcher_attr.upper() == "TEXT":
phrase_matcher_attr = "ORTH"
self.phrase_matcher_attr = phrase_matcher_attr
self.phrase_matcher = PhraseMatcher(
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
)
else:
self.phrase_matcher_attr = None
self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
self.phrase_matcher_attr = phrase_matcher_attr
self.phrase_matcher = PhraseMatcher(
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
)
self.ent_id_sep = ent_id_sep
self._ent_ids = defaultdict(dict)
if patterns is not None:
@ -317,20 +312,27 @@ class EntityRuler(Pipe):
pattern = entry["pattern"]
if isinstance(pattern, Doc):
self.phrase_patterns[label].append(pattern)
self.phrase_matcher.add(label, [pattern])
elif isinstance(pattern, list):
self.token_patterns[label].append(pattern)
self.matcher.add(label, [pattern])
else:
raise ValueError(Errors.E097.format(pattern=pattern))
for label, patterns in self.token_patterns.items():
self.matcher.add(label, patterns)
for label, patterns in self.phrase_patterns.items():
self.phrase_matcher.add(label, patterns)
def clear(self) -> None:
"""Reset all patterns."""
self.token_patterns = defaultdict(list)
self.phrase_patterns = defaultdict(list)
self._ent_ids = defaultdict(dict)
self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
)
def _require_patterns(self) -> None:
"""Raise a warning if this component has no patterns defined."""
if len(self) == 0:
warnings.warn(Warnings.W036.format(name=self.name))
def _require_patterns(self) -> None:
"""Raise a warning if this component has no patterns defined."""
@ -381,10 +383,9 @@ class EntityRuler(Pipe):
self.add_patterns(cfg.get("patterns", cfg))
self.overwrite = cfg.get("overwrite", False)
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
if self.phrase_matcher_attr is not None:
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr
)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr
)
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
else:
self.add_patterns(cfg)
@ -435,10 +436,9 @@ class EntityRuler(Pipe):
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
if self.phrase_matcher_attr is not None:
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr
)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr
)
from_disk(path, deserializers_patterns, {})
return self

View File

@ -2,6 +2,8 @@ from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple
from thinc.api import Model
from pathlib import Path
import warnings
from .pipe import Pipe
from ..errors import Errors, Warnings
from ..language import Language
@ -182,7 +184,7 @@ class Lemmatizer(Pipe):
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
if univ_pos == "":
logger.warning(Warnings.W108.format(text=string))
warnings.warn(Warnings.W108.format(text=string))
return [string.lower()]
# See Issue #435 for example of where this logic is requied.
if self.is_base_form(token):

View File

@ -324,7 +324,8 @@ cdef class TrainablePipe(Pipe):
def load_model(p):
try:
self.model.from_bytes(p.open("rb").read())
with open(p, "rb") as mfile:
self.model.from_bytes(mfile.read())
except AttributeError:
raise ValueError(Errors.E149) from None

View File

@ -313,6 +313,7 @@ class ConfigSchemaTraining(BaseModel):
optimizer: Optimizer = Field(..., title="The optimizer to use")
logger: Logger = Field(..., title="The logger to track training progress")
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
# fmt: on

View File

@ -293,6 +293,12 @@ def ur_tokenizer():
return get_lang_class("ur")().tokenizer
@pytest.fixture(scope="session")
def vi_tokenizer():
pytest.importorskip("pyvi")
return get_lang_class("vi")().tokenizer
@pytest.fixture(scope="session")
def yo_tokenizer():
return get_lang_class("yo")().tokenizer

View File

@ -2,8 +2,6 @@ import weakref
import pytest
import numpy
import logging
import mock
from spacy.lang.xx import MultiLanguage
from spacy.tokens import Doc, Span, Token
@ -158,13 +156,10 @@ def test_doc_api_serialize(en_tokenizer, text):
def inner_func(d1, d2):
return "hello!"
logger = logging.getLogger("spacy")
with mock.patch.object(logger, "warning") as mock_warning:
_ = tokens.to_bytes() # noqa: F841
mock_warning.assert_not_called()
_ = tokens.to_bytes() # noqa: F841
with pytest.warns(UserWarning):
tokens.user_hooks["similarity"] = inner_func
_ = tokens.to_bytes() # noqa: F841
mock_warning.assert_called_once()
def test_doc_api_set_ents(en_tokenizer):

View File

@ -12,13 +12,13 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
tokens = ca_tokenizer(text)
assert len(tokens) == 138
assert len(tokens) == 140
@pytest.mark.parametrize(
"text,length",
[
("Perquè va anar-hi?", 6),
("Perquè va anar-hi?", 4),
("“Ah no?”", 5),
("""Sí! "Anem", va contestar el Joan Carles""", 11),
("Van córrer aprox. 10km", 5),

View File

@ -8,7 +8,7 @@ from spacy.util import get_lang_class
# Only include languages with no external dependencies
# excluded: ru, uk
# excluded for custom tables: es, pl
LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
LANGUAGES = ["bn", "ca", "el", "en", "fa", "fr", "nb", "nl", "sv"]
# fmt: on

View File

View File

@ -0,0 +1,33 @@
from spacy.lang.vi import Vietnamese
from ...util import make_tempdir
def test_vi_tokenizer_serialize(vi_tokenizer):
tokenizer_bytes = vi_tokenizer.to_bytes()
nlp = Vietnamese()
nlp.tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
assert nlp.tokenizer.use_pyvi is True
with make_tempdir() as d:
file_path = d / "tokenizer"
vi_tokenizer.to_disk(file_path)
nlp = Vietnamese()
nlp.tokenizer.from_disk(file_path)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
assert nlp.tokenizer.use_pyvi is True
# mode is (de)serialized correctly
nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
nlp_bytes = nlp.to_bytes()
nlp_r = Vietnamese()
nlp_r.from_bytes(nlp_bytes)
assert nlp_bytes == nlp_r.to_bytes()
assert nlp_r.tokenizer.use_pyvi == False
with make_tempdir() as d:
nlp.to_disk(d)
nlp_r = Vietnamese()
nlp_r.from_disk(d)
assert nlp_bytes == nlp_r.to_bytes()
assert nlp_r.tokenizer.use_pyvi == False

View File

@ -0,0 +1,47 @@
import pytest
from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
from spacy.lang.vi import Vietnamese
# fmt: off
TOKENIZER_TESTS = [
("Đây là một văn bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này", ['Đây', '', 'một', 'văn bản', 'bằng', 'tiếng', 'Việt', 'Sau', 'đó', ',', 'đây', '', 'một', 'văn bản', 'khác', 'bằng', 'ngôn ngữ', 'này']),
]
# fmt: on
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_vi_tokenizer(vi_tokenizer, text, expected_tokens):
tokens = [token.text for token in vi_tokenizer(text)]
assert tokens == expected_tokens
def test_vi_tokenizer_extra_spaces(vi_tokenizer):
# note: three spaces after "I"
tokens = vi_tokenizer("I like cheese.")
assert tokens[1].orth_ == " "
@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
def test_vi_tokenizer_naughty_strings(vi_tokenizer, text):
tokens = vi_tokenizer(text)
assert tokens.text_with_ws == text
def test_vi_tokenizer_emptyish_texts(vi_tokenizer):
doc = vi_tokenizer("")
assert len(doc) == 0
doc = vi_tokenizer(" ")
assert len(doc) == 1
doc = vi_tokenizer("\n\n\n \t\t \n\n\n")
assert len(doc) == 1
def test_vi_tokenizer_no_pyvi():
"""Test for whitespace tokenization without pyvi"""
nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
text = "Đây là một văn bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này"
doc = nlp(text)
assert [t.text for t in doc if not t.is_space] == text.split()
assert doc[4].text == " "

View File

@ -252,12 +252,12 @@ def test_ruler_before_ner():
# 1 : Entity Ruler - should set "this" to B and everything else to empty
patterns = [{"label": "THING", "pattern": "This"}]
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
# 2: untrained NER - should set everything else to O
untrained_ner = nlp.add_pipe("ner")
untrained_ner.add_label("MY_LABEL")
nlp.initialize()
ruler.add_patterns(patterns)
doc = nlp("This is Antti Korhonen speaking in Finland")
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
expected_types = ["THING", "", "", "", "", "", ""]

View File

@ -0,0 +1,113 @@
from typing import Callable, Iterable, Iterator
import pytest
import io
from thinc.api import Config
from spacy.language import Language
from spacy.training import Example
from spacy.training.loop import train
from spacy.lang.en import English
from spacy.util import registry, load_model_from_config
@pytest.fixture
def config_str():
return """
[nlp]
lang = "en"
pipeline = ["sentencizer","assert_sents"]
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
[components]
[components.assert_sents]
factory = "assert_sents"
[components.sentencizer]
factory = "sentencizer"
punct_chars = null
[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
annotating_components = ["sentencizer"]
max_steps = 2
[corpora]
[corpora.dev]
@readers = "unannotated_corpus"
[corpora.train]
@readers = "unannotated_corpus"
"""
def test_annotates_on_update():
# The custom component checks for sentence annotation
@Language.factory("assert_sents", default_config={})
def assert_sents(nlp, name):
return AssertSents(name)
class AssertSents:
def __init__(self, name, **cfg):
self.name = name
pass
def __call__(self, doc):
if not doc.has_annotation("SENT_START"):
raise ValueError("No sents")
return doc
def update(self, examples, *, drop=0.0, sgd=None, losses=None):
for example in examples:
if not example.predicted.has_annotation("SENT_START"):
raise ValueError("No sents")
return {}
nlp = English()
nlp.add_pipe("sentencizer")
nlp.add_pipe("assert_sents")
# When the pipeline runs, annotations are set
doc = nlp("This is a sentence.")
examples = []
for text in ["a a", "b b", "c c"]:
examples.append(Example(nlp.make_doc(text), nlp(text)))
for example in examples:
assert not example.predicted.has_annotation("SENT_START")
# If updating without setting annotations, assert_sents will raise an error
with pytest.raises(ValueError):
nlp.update(examples)
# Updating while setting annotations for the sentencizer succeeds
nlp.update(examples, annotates=["sentencizer"])
def test_annotating_components_from_config(config_str):
@registry.readers("unannotated_corpus")
def create_unannotated_corpus() -> Callable[[Language], Iterable[Example]]:
return UnannotatedCorpus()
class UnannotatedCorpus:
def __call__(self, nlp: Language) -> Iterator[Example]:
for text in ["a a", "b b", "c c"]:
doc = nlp.make_doc(text)
yield Example(doc, doc)
orig_config = Config().from_str(config_str)
nlp = load_model_from_config(orig_config, auto_fill=True, validate=True)
assert nlp.config["training"]["annotating_components"] == ["sentencizer"]
train(nlp)
nlp.config["training"]["annotating_components"] = []
with pytest.raises(ValueError):
train(nlp)

View File

@ -89,6 +89,19 @@ def test_entity_ruler_init_clear(nlp, patterns):
assert len(ruler.labels) == 0
def test_entity_ruler_clear(nlp, patterns):
"""Test that initialization clears patterns."""
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
assert len(ruler.labels) == 4
doc = nlp("hello world")
assert len(doc.ents) == 1
ruler.clear()
assert len(ruler.labels) == 0
doc = nlp("hello world")
assert len(doc.ents) == 0
def test_entity_ruler_existing(nlp, patterns):
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)

View File

@ -1,6 +1,4 @@
import pytest
import logging
import mock
import pickle
from spacy import util, registry
from spacy.lang.en import English
@ -59,10 +57,10 @@ def test_lemmatizer_config(nlp):
# warning if no POS assigned
doc = nlp.make_doc("coping")
logger = logging.getLogger("spacy")
with mock.patch.object(logger, "warning") as mock_warning:
with pytest.warns(UserWarning):
doc = lemmatizer(doc)
mock_warning.assert_called_once()
# warns once by default
doc = lemmatizer(doc)
# works with POS
doc = nlp.make_doc("coping")

View File

@ -1,6 +1,4 @@
import pytest
import mock
import logging
from spacy.language import Language
from spacy.lang.en import English
from spacy.lang.de import German
@ -334,24 +332,31 @@ def test_language_factories_invalid():
@pytest.mark.parametrize(
"weights,expected",
"weights,override,expected",
[
([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}),
([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}),
([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {}, {"a": 0.33, "b": 0.33, "c": 0.33}),
([{"a": 1.0}, {"b": 50}, {"c": 100}], {}, {"a": 0.01, "b": 0.33, "c": 0.66}),
(
[{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}],
{},
{"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17},
),
(
[{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
{"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
[{"a": 100, "b": 300}, {"c": 50, "d": 50}],
{},
{"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1},
),
([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {}, {"a": 0.33, "b": 0.67}),
([{"a": 0.5, "b": 0.0}], {}, {"a": 1.0, "b": 0.0}),
([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}),
([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}),
([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}),
([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}),
([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}),
],
)
def test_language_factories_combine_score_weights(weights, expected):
result = combine_score_weights(weights)
def test_language_factories_combine_score_weights(weights, override, expected):
result = combine_score_weights(weights, override)
assert sum(result.values()) in (0.99, 1.0, 0.0)
assert result == expected
@ -377,17 +382,17 @@ def test_language_factories_scores():
# Test with custom defaults
config = nlp.config.copy()
config["training"]["score_weights"]["a1"] = 0.0
config["training"]["score_weights"]["b3"] = 1.0
config["training"]["score_weights"]["b3"] = 1.3
nlp = English.from_config(config)
score_weights = nlp.config["training"]["score_weights"]
expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65}
assert score_weights == expected
# Test with null values
config = nlp.config.copy()
config["training"]["score_weights"]["a1"] = None
nlp = English.from_config(config)
score_weights = nlp.config["training"]["score_weights"]
expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66}
assert score_weights == expected
@ -430,10 +435,8 @@ def test_pipe_factories_from_source_language_subclass():
nlp = English()
nlp.vocab.vectors.resize((1, 4))
nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4])
logger = logging.getLogger("spacy")
with mock.patch.object(logger, "warning") as mock_warning:
with pytest.warns(UserWarning):
nlp.add_pipe("tagger", source=source_nlp)
mock_warning.assert_called()
def test_pipe_factories_from_source_custom():

View File

@ -1,7 +1,9 @@
import pytest
from spacy.language import Language
from spacy.pipeline import TrainablePipe
from spacy.training import Example
from spacy.util import SimpleFrozenList, get_arg_names
from spacy.lang.en import English
@pytest.fixture
@ -417,3 +419,41 @@ def test_pipe_methods_initialize():
assert "test" in nlp.config["initialize"]["components"]
nlp.remove_pipe("test")
assert "test" not in nlp.config["initialize"]["components"]
def test_update_with_annotates():
name = "test_with_annotates"
results = {}
def make_component(name):
results[name] = ""
def component(doc):
nonlocal results
results[name] += doc.text
return doc
return component
c1 = Language.component(f"{name}1", func=make_component(f"{name}1"))
c2 = Language.component(f"{name}2", func=make_component(f"{name}2"))
components = set([f"{name}1", f"{name}2"])
nlp = English()
texts = ["a", "bb", "ccc"]
examples = []
for text in texts:
examples.append(Example(nlp.make_doc(text), nlp.make_doc(text)))
for components_to_annotate in [[], [f"{name}1"], [f"{name}1", f"{name}2"], [f"{name}2", f"{name}1"]]:
for key in results:
results[key] = ""
nlp = English(vocab=nlp.vocab)
nlp.add_pipe(f"{name}1")
nlp.add_pipe(f"{name}2")
nlp.update(examples, annotates=components_to_annotate)
for component in components_to_annotate:
assert results[component] == "".join(eg.predicted.text for eg in examples)
for component in components - set(components_to_annotate):
assert results[component] == ""

View File

@ -0,0 +1,34 @@
import pytest
from spacy import registry
from spacy.language import Language
from spacy.pipeline import EntityRuler
@pytest.fixture
def nlp():
return Language()
@pytest.fixture
@registry.misc("entity_ruler_patterns")
def patterns():
return [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
{"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"},
]
def test_entity_ruler_fix8216(nlp, patterns):
"""Test that patterns don't get added excessively."""
ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
ruler.add_patterns(patterns)
pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
assert pattern_count > 0
ruler.add_patterns([])
after_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
assert after_count == pattern_count

View File

@ -84,7 +84,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n
@pytest.mark.parametrize("file_name", ["sun.txt"])
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
loc = ensure_path(__file__).parent / file_name
text = loc.open("r", encoding="utf8").read()
with loc.open("r", encoding="utf8") as infile:
text = infile.read()
assert len(text) != 0
tokens = tokenizer(text)
assert len(tokens) > 100

View File

@ -14,7 +14,7 @@ cdef class Tokenizer:
cdef Pool mem
cdef PreshMap _cache
cdef PreshMap _specials
cpdef readonly Vocab vocab
cdef readonly Vocab vocab
cdef object _token_match
cdef object _url_match

View File

@ -1321,7 +1321,7 @@ cdef class Doc:
if "user_data_values" not in exclude:
serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
if "user_hooks" not in exclude and any((self.user_hooks, self.user_token_hooks, self.user_span_hooks)):
util.logger.warning(Warnings.W109)
warnings.warn(Warnings.W109)
return util.to_dict(serializers, exclude)
def from_dict(self, msg, *, exclude=tuple()):

View File

@ -74,6 +74,8 @@ def train(
# Components that shouldn't be updated during training
frozen_components = T["frozen_components"]
# Components that should set annotations on update
annotating_components = T["annotating_components"]
# Create iterator, which yields out info after each optimization step.
training_step_iterator = train_while_improving(
nlp,
@ -86,11 +88,17 @@ def train(
max_steps=T["max_steps"],
eval_frequency=T["eval_frequency"],
exclude=frozen_components,
annotating_components=annotating_components,
)
clean_output_dir(output_path)
stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
if frozen_components:
stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
if annotating_components:
stdout.write(
msg.info(f"Set annotations on update for: {annotating_components}")
+ "\n"
)
stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
with nlp.select_pipes(disable=frozen_components):
log_step, finalize_logger = train_logger(nlp, stdout, stderr)
@ -142,6 +150,7 @@ def train_while_improving(
patience: int,
max_steps: int,
exclude: List[str],
annotating_components: List[str],
):
"""Train until an evaluation stops improving. Works as a generator,
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
@ -193,7 +202,12 @@ def train_while_improving(
dropout = next(dropouts)
for subbatch in subdivide_batch(batch, accumulate_gradient):
nlp.update(
subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
subbatch,
drop=dropout,
losses=losses,
sgd=False,
exclude=exclude,
annotates=annotating_components,
)
# TODO: refactor this so we don't have to run it separately in here
for name, proc in nlp.pipeline:

View File

@ -1370,32 +1370,14 @@ def combine_score_weights(
should be preserved.
RETURNS (Dict[str, float]): The combined and normalized weights.
"""
# We divide each weight by the total weight sum.
# We first need to extract all None/null values for score weights that
# shouldn't be shown in the table *or* be weighted
result = {}
all_weights = []
for w_dict in weights:
filtered_weights = {}
for key, value in w_dict.items():
value = overrides.get(key, value)
if value is None:
result[key] = None
else:
filtered_weights[key] = value
all_weights.append(filtered_weights)
for w_dict in all_weights:
# We need to account for weights that don't sum to 1.0 and normalize
# the score weights accordingly, then divide score by the number of
# components.
total = sum(w_dict.values())
for key, value in w_dict.items():
if total == 0:
weight = 0.0
else:
weight = round(value / total / len(all_weights), 2)
prev_weight = result.get(key, 0.0)
prev_weight = 0.0 if prev_weight is None else prev_weight
result[key] = prev_weight + weight
result = {key: overrides.get(key, value) for w_dict in weights for (key, value) in w_dict.items()}
weight_sum = sum([v if v else 0.0 for v in result.values()])
for key, value in result.items():
if value and weight_sum > 0:
result[key] = round(value / weight_sum, 2)
return result

View File

@ -25,12 +25,12 @@ cdef struct _Cached:
cdef class Vocab:
cdef Pool mem
cpdef readonly StringStore strings
cpdef public Morphology morphology
cpdef public object vectors
cpdef public object _lookups
cpdef public object writing_system
cpdef public object get_noun_chunks
cdef readonly StringStore strings
cdef public Morphology morphology
cdef public object vectors
cdef public object _lookups
cdef public object writing_system
cdef public object get_noun_chunks
cdef readonly int length
cdef public object data_dir
cdef public object lex_attr_getters

View File

@ -182,24 +182,25 @@ single corpus once and then divide it up into `train` and `dev` partitions.
This section defines settings and controls for the training and evaluation
process that are used when you run [`spacy train`](/api/cli#train).
| Name | Description |
| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ |
| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ |
| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ |
| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ |
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
| Name | Description |
| ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
| `annotating_components` | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. ~~List[str]~~ |
| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ |
| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ |
| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ |
| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ |
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
### pretraining {#config-pretraining tag="section,optional"}

View File

@ -245,14 +245,14 @@ and call the optimizer, while the others simply increment the gradients.
> losses = trf.update(examples, sgd=optimizer)
> ```
| Name | Description |
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
| Name | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## Transformer.create_optimizer {#create_optimizer tag="method"}
@ -493,6 +493,11 @@ This requires sentence boundaries to be set (e.g. by the
depending on the sentence lengths. However, it does provide the transformer with
more meaningful windows to attend over.
To set sentence boundaries with the `sentencizer` during training, add a
`sentencizer` to the beginning of the pipeline and include it in
[`[training.annotating_components]`](/usage/training#annotating-components) to
have it set the sentence boundaries before the `transformer` component runs.
### strided_spans.v1 {#strided_spans tag="registered function"}
> #### Example config

View File

@ -422,11 +422,11 @@ as-is. They are also excluded when calling
> #### Note on frozen components
>
> Even though frozen components are not **updated** during training, they will
> still **run** during training and evaluation. This is very important, because
> they may still impact your model's performance for instance, a sentence
> boundary detector can impact what the parser or entity recognizer considers a
> valid parse. So the evaluation results should always reflect what your
> pipeline will produce at runtime.
> still **run** during evaluation. This is very important, because they may
> still impact your model's performance for instance, a sentence boundary
> detector can impact what the parser or entity recognizer considers a valid
> parse. So the evaluation results should always reflect what your pipeline will
> produce at runtime.
```ini
[nlp]
@ -463,6 +463,64 @@ replace_listeners = ["model.tok2vec"]
</Infobox>
### Using predictions from preceding components {#annotating-components new="3.1"}
By default, components are updated in isolation during training, which means
that they don't see the predictions of any earlier components in the pipeline. A
component receives [`Example.predicted`](/api/example) as input and compares its
predictions to [`Example.reference`](/api/example) without saving its
annotations in the `predicted` doc.
Instead, if certain components should **set their annotations** during training,
use the setting `annotating_components` in the `[training]` block to specify a
list of components. For example, the feature `DEP` from the parser could be used
as a tagger feature by including `DEP` in the tok2vec `attrs` and including
`parser` in `annotating_components`:
```ini
### config.cfg (excerpt) {highlight="7,12"}
[nlp]
pipeline = ["parser", "tagger"]
[components.tagger.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tagger.model.tok2vec.encode.width}
attrs = ["NORM","DEP"]
rows = [5000,2500]
include_static_vectors = false
[training]
annotating_components = ["parser"]
```
Any component in the pipeline can be included as an annotating component,
including frozen components. Frozen components can set annotations during
training just as they would set annotations during evaluation or when the final
pipeline is run. The config excerpt below shows how a frozen `ner` component and
a `sentencizer` can provide the required `doc.sents` and `doc.ents` for the
entity linker during training:
```ini
### config.cfg (excerpt)
[nlp]
pipeline = ["sentencizer", "ner", "entity_linker"]
[components.ner]
source = "en_core_web_sm"
[training]
frozen_components = ["ner"]
annotating_components = ["sentencizer", "ner"]
```
<Infobox variant="warning" title="Training speed with annotating components" id="annotating-components-speed">
Be aware that non-frozen annotating components with statistical models will
**run twice** on each batch, once to update the model and once to apply the
now-updated model to the predicted docs.
</Infobox>
### Using registered functions {#config-functions}
The training configuration defined in the config file doesn't have to only

View File

@ -25,7 +25,13 @@
"code": "ca",
"name": "Catalan",
"example": "Això és una frase.",
"has_examples": true
"has_examples": true,
"models": [
"ca_core_news_sm",
"ca_core_news_md",
"ca_core_news_lg",
"ca_core_news_trf"
]
},
{
"code": "cs",
@ -40,7 +46,8 @@
"models": [
"da_core_news_sm",
"da_core_news_md",
"da_core_news_lg"
"da_core_news_lg",
"da_core_news_trf"
]
},
{