mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Merge remote-tracking branch 'upstream/develop' into chore/develop-into-master-v3.1
This commit is contained in:
commit
5646fcbe46
|
@ -8,3 +8,4 @@ recursive-exclude spacy/lang *.json
|
||||||
recursive-include spacy/lang *.json.gz
|
recursive-include spacy/lang *.json.gz
|
||||||
recursive-include spacy/cli *.json *.yml
|
recursive-include spacy/cli *.json *.yml
|
||||||
recursive-include licenses *
|
recursive-include licenses *
|
||||||
|
recursive-exclude spacy *.cpp
|
||||||
|
|
|
@ -43,8 +43,8 @@ scikit-learn
|
||||||
|
|
||||||
* Files: scorer.py
|
* Files: scorer.py
|
||||||
|
|
||||||
The following implementation of roc_auc_score() is adapted from
|
The implementation of roc_auc_score() is adapted from scikit-learn, which is
|
||||||
scikit-learn, which is distributed under the following license:
|
distributed under the following license:
|
||||||
|
|
||||||
New BSD License
|
New BSD License
|
||||||
|
|
||||||
|
@ -77,3 +77,30 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||||
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
|
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
|
||||||
DAMAGE.
|
DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
|
pyvi
|
||||||
|
----
|
||||||
|
|
||||||
|
* Files: lang/vi/__init__.py
|
||||||
|
|
||||||
|
The MIT License (MIT)
|
||||||
|
Copyright (c) 2016 Viet-Trung Tran
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||||
|
this software and associated documentation files (the "Software"), to deal in
|
||||||
|
the Software without restriction, including without limitation the rights to
|
||||||
|
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||||
|
of the Software, and to permit persons to whom the Software is furnished to do
|
||||||
|
so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
|
|
|
@ -68,7 +68,7 @@ console_scripts =
|
||||||
|
|
||||||
[options.extras_require]
|
[options.extras_require]
|
||||||
lookups =
|
lookups =
|
||||||
spacy_lookups_data>=1.0.0,<1.1.0
|
spacy_lookups_data>=1.0.1,<1.1.0
|
||||||
transformers =
|
transformers =
|
||||||
spacy_transformers>=1.0.1,<1.1.0
|
spacy_transformers>=1.0.1,<1.1.0
|
||||||
ray =
|
ray =
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy"
|
__title__ = "spacy"
|
||||||
__version__ = "3.0.6"
|
__version__ = "3.1.0.dev0"
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__projects__ = "https://github.com/explosion/projects"
|
__projects__ = "https://github.com/explosion/projects"
|
||||||
|
|
|
@ -115,7 +115,8 @@ def convert(
|
||||||
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
||||||
doc_files = []
|
doc_files = []
|
||||||
for input_loc in walk_directory(Path(input_path), converter):
|
for input_loc in walk_directory(Path(input_path), converter):
|
||||||
input_data = input_loc.open("r", encoding="utf-8").read()
|
with input_loc.open("r", encoding="utf-8") as infile:
|
||||||
|
input_data = infile.read()
|
||||||
# Use converter function to convert data
|
# Use converter function to convert data
|
||||||
func = CONVERTERS[converter]
|
func = CONVERTERS[converter]
|
||||||
docs = func(
|
docs = func(
|
||||||
|
|
|
@ -112,7 +112,9 @@ def package(
|
||||||
msg.fail("Invalid pipeline meta.json")
|
msg.fail("Invalid pipeline meta.json")
|
||||||
print("\n".join(errors))
|
print("\n".join(errors))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
model_name = meta["lang"] + "_" + meta["name"]
|
model_name = meta["name"]
|
||||||
|
if not model_name.startswith(meta["lang"] + "_"):
|
||||||
|
model_name = f"{meta['lang']}_{model_name}"
|
||||||
model_name_v = model_name + "-" + meta["version"]
|
model_name_v = model_name + "-" + meta["version"]
|
||||||
main_path = output_dir / model_name_v
|
main_path = output_dir / model_name_v
|
||||||
package_path = main_path / model_name
|
package_path = main_path / model_name
|
||||||
|
@ -128,9 +130,10 @@ def package(
|
||||||
)
|
)
|
||||||
Path.mkdir(package_path, parents=True)
|
Path.mkdir(package_path, parents=True)
|
||||||
shutil.copytree(str(input_dir), str(package_path / model_name_v))
|
shutil.copytree(str(input_dir), str(package_path / model_name_v))
|
||||||
license_path = package_path / model_name_v / "LICENSE"
|
for file_name in FILENAMES_DOCS:
|
||||||
if license_path.exists():
|
file_path = package_path / model_name_v / file_name
|
||||||
shutil.move(str(license_path), str(main_path))
|
if file_path.exists():
|
||||||
|
shutil.move(str(file_path), str(main_path))
|
||||||
imports = []
|
imports = []
|
||||||
for code_path in code_paths:
|
for code_path in code_paths:
|
||||||
imports.append(code_path.stem)
|
imports.append(code_path.stem)
|
||||||
|
@ -294,7 +297,7 @@ def setup_package():
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
setup_package()
|
setup_package()
|
||||||
""".strip()
|
""".lstrip()
|
||||||
|
|
||||||
|
|
||||||
TEMPLATE_MANIFEST = """
|
TEMPLATE_MANIFEST = """
|
||||||
|
@ -314,4 +317,7 @@ __version__ = get_model_meta(Path(__file__).parent)['version']
|
||||||
|
|
||||||
def load(**overrides):
|
def load(**overrides):
|
||||||
return load_model_from_init_py(__file__, **overrides)
|
return load_model_from_init_py(__file__, **overrides)
|
||||||
""".strip()
|
""".lstrip()
|
||||||
|
|
||||||
|
|
||||||
|
FILENAMES_DOCS = ["LICENSE", "LICENSES_SOURCES", "README.md"]
|
||||||
|
|
|
@ -372,7 +372,7 @@ factory = "{{ pipe }}"
|
||||||
[corpora.train]
|
[corpora.train]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
path = ${paths.train}
|
path = ${paths.train}
|
||||||
max_length = {{ 500 if hardware == "gpu" else 2000 }}
|
max_length = 0
|
||||||
|
|
||||||
[corpora.dev]
|
[corpora.dev]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
|
|
|
@ -80,6 +80,8 @@ eval_frequency = 200
|
||||||
score_weights = {}
|
score_weights = {}
|
||||||
# Names of pipeline components that shouldn't be updated during training
|
# Names of pipeline components that shouldn't be updated during training
|
||||||
frozen_components = []
|
frozen_components = []
|
||||||
|
# Names of pipeline components that should set annotations during training
|
||||||
|
annotating_components = []
|
||||||
# Location in the config where the dev corpus is defined
|
# Location in the config where the dev corpus is defined
|
||||||
dev_corpus = "corpora.dev"
|
dev_corpus = "corpora.dev"
|
||||||
# Location in the config where the train corpus is defined
|
# Location in the config where the train corpus is defined
|
||||||
|
|
|
@ -24,6 +24,9 @@ def setup_default_warnings():
|
||||||
for pipe in ["matcher", "entity_ruler"]:
|
for pipe in ["matcher", "entity_ruler"]:
|
||||||
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
|
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
|
||||||
|
|
||||||
|
# warn once about lemmatizer without required POS
|
||||||
|
filter_warning("once", error_msg="[W108]")
|
||||||
|
|
||||||
|
|
||||||
def filter_warning(action: str, error_msg: str):
|
def filter_warning(action: str, error_msg: str):
|
||||||
"""Customize how spaCy should handle a certain warning.
|
"""Customize how spaCy should handle a certain warning.
|
||||||
|
|
|
@ -28,7 +28,7 @@ cdef class Candidate:
|
||||||
|
|
||||||
cdef class KnowledgeBase:
|
cdef class KnowledgeBase:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cpdef readonly Vocab vocab
|
cdef readonly Vocab vocab
|
||||||
cdef int64_t entity_vector_length
|
cdef int64_t entity_vector_length
|
||||||
|
|
||||||
# This maps 64bit keys (hash of unique entity string)
|
# This maps 64bit keys (hash of unique entity string)
|
||||||
|
|
|
@ -1,15 +1,23 @@
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
|
from .lemmatizer import CatalanLemmatizer
|
||||||
|
|
||||||
|
|
||||||
class CatalanDefaults(Language.Defaults):
|
class CatalanDefaults(Language.Defaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
|
||||||
|
|
||||||
class Catalan(Language):
|
class Catalan(Language):
|
||||||
|
@ -17,4 +25,16 @@ class Catalan(Language):
|
||||||
Defaults = CatalanDefaults
|
Defaults = CatalanDefaults
|
||||||
|
|
||||||
|
|
||||||
|
@Catalan.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "overwrite": False},
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
):
    """Factory for the Catalan "lemmatizer" pipeline component.

    nlp (Language): The pipeline object; its vocab is passed to the component.
    model (Optional[Model]): Forwarded unchanged (the factory default is None).
    name (str): The component instance name.
    mode (str): The lemmatizer mode (the factory default is "rule").
    overwrite (bool): Forwarded as the component's overwrite setting.
    RETURNS (CatalanLemmatizer): The newly constructed component.
    """
    return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Catalan"]
|
__all__ = ["Catalan"]
|
||||||
|
|
81
spacy/lang/ca/lemmatizer.py
Normal file
81
spacy/lang/ca/lemmatizer.py
Normal file
|
@ -0,0 +1,81 @@
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
|
from ...pipeline import Lemmatizer
|
||||||
|
from ...tokens import Token
|
||||||
|
|
||||||
|
|
||||||
|
class CatalanLemmatizer(Lemmatizer):
    """
    Copied from French Lemmatizer
    Catalan language lemmatizer applies the default rule based lemmatization
    procedure with some modifications for better Catalan language support.

    Rule-based lemmatization is applied to tokens tagged 'NOUN', 'VERB',
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'PRON', 'PUNCT' or 'SCONJ'.
    As a last resort, the lemmatizer checks in the lookup table.
    """

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        """Return the names of the lookup tables used by a lemmatizer mode.

        mode (str): The lemmatizer mode.
        RETURNS (Tuple[List[str], List[str]]): For "rule" mode, the four
            required table names and an empty second list; any other mode is
            delegated to the parent class.
        """
        if mode == "rule":
            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
            return (required, [])
        else:
            return super().get_lookups_config(mode)

    def rule_lemmatize(self, token: Token) -> List[str]:
        """Lemmatize a token using the index, exception and suffix-rule tables.

        token (Token): The token to lemmatize.
        RETURNS (List[str]): The candidate lemmas, de-duplicated in the order
            in which they were produced.
        """
        cache_key = (token.orth, token.pos)
        if cache_key in self.cache:
            return self.cache[cache_key]
        string = token.text
        univ_pos = token.pos_.lower()
        if univ_pos in ("", "eol", "space"):
            # Not cached: the answer is just the lowercased text.
            return [string.lower()]
        elif "lemma_rules" not in self.lookups or univ_pos not in (
            "noun",
            "verb",
            "adj",
            "adp",
            "adv",
            "aux",
            "cconj",
            "det",
            "pron",
            "punct",
            "sconj",
        ):
            # No rules available, or a part of speech the rules don't cover.
            return self.lookup_lemmatize(token)
        index_table = self.lookups.get_table("lemma_index", {})
        exc_table = self.lookups.get_table("lemma_exc", {})
        rules_table = self.lookups.get_table("lemma_rules", {})
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        index = index_table.get(univ_pos, {})
        exceptions = exc_table.get(univ_pos, {})
        rules = rules_table.get(univ_pos, [])
        string = string.lower()
        forms = []
        if string in index:
            # The lowercased form is itself listed in the index for this POS.
            forms.append(string)
            self.cache[cache_key] = forms
            return forms
        forms.extend(exceptions.get(string, []))
        oov_forms = []
        if not forms:
            for old, new in rules:
                if string.endswith(old):
                    form = string[: len(string) - len(old)] + new
                    if not form:
                        pass
                    elif form in index or not form.isalpha():
                        forms.append(form)
                    else:
                        # Keep rule output that is not in the index as a
                        # fallback only.
                        oov_forms.append(form)
        if not forms:
            forms.extend(oov_forms)
        # NOTE(review): membership is tested against .keys() exactly as in the
        # original; if the table type overrides __contains__, a plain
        # `in lookup_table` could behave differently - confirm before changing.
        if not forms and string in lookup_table.keys():
            forms.append(self.lookup_lemmatize(token)[0])
        if not forms:
            forms.append(string)
        # De-duplicate while preserving order. A set's iteration order for
        # strings depends on the hash seed, so list(set(...)) could return the
        # candidates in a different order from one process to the next.
        forms = list(dict.fromkeys(forms))
        self.cache[cache_key] = forms
        return forms
|
|
@ -1,12 +1,46 @@
|
||||||
from ..punctuation import TOKENIZER_INFIXES
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
||||||
from ..char_classes import ALPHA
|
from ..char_classes import CURRENCY
|
||||||
|
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
|
||||||
|
from ..char_classes import merge_chars, _units
|
||||||
|
|
||||||
|
|
||||||
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
||||||
|
|
||||||
|
|
||||||
_infixes = TOKENIZER_INFIXES + [
|
_infixes = (
|
||||||
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
|
LIST_ELLIPSES
|
||||||
]
|
+ LIST_ICONS
|
||||||
|
+ [
|
||||||
|
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||||
|
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||||
|
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||||
|
),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
_units = _units.replace("% ", "")
|
||||||
|
UNITS = merge_chars(_units)
|
||||||
|
|
||||||
|
_suffixes = (
|
||||||
|
LIST_PUNCT
|
||||||
|
+ LIST_ELLIPSES
|
||||||
|
+ LIST_QUOTES
|
||||||
|
+ LIST_ICONS
|
||||||
|
+ [r"-", "—", "–"]
|
||||||
|
+ [
|
||||||
|
r"(?<=[0-9])\+",
|
||||||
|
r"(?<=°[FfCcKk])\.",
|
||||||
|
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||||
|
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||||
|
r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
|
||||||
|
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
|
||||||
|
),
|
||||||
|
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
TOKENIZER_INFIXES = _infixes
|
TOKENIZER_INFIXES = _infixes
|
||||||
|
TOKENIZER_SUFFIXES = _suffixes
|
||||||
|
|
46
spacy/lang/ca/syntax_iterators.py
Normal file
46
spacy/lang/ca/syntax_iterators.py
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
from ...symbols import NOUN, PROPN
|
||||||
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
|
def noun_chunks(doclike):
    """Detect base noun phrases from a dependency parse. Works on Doc and Span.

    doclike: A Doc or Span; it is iterated token by token and its `.doc` is
        used for the annotation check and for indexing.
    YIELDS (tuple): (left, right, np_label) where left/right are token
        indices (right is one past the right edge before trimming) and
        np_label is the string-store id of "NP".
    RAISES (ValueError): If the document has no "DEP" annotation.
    """
    # fmt: off
    labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings[label] for label in labels]
    np_label = doc.vocab.strings.add("NP")
    prev_end = -1
    for word in doclike:
        if word.pos not in (NOUN, PROPN):
            continue
        # Prevent nested chunks from being produced
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            left = word.left_edge.i
            right = word.right_edge.i + 1
            # leave prepositions and punctuation out of the left side of the chunk
            if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT":
                left = word.left_edge.i + 1
            prev_end = word.right_edge.i
            # leave subordinated clauses and appositions out of the chunk:
            # scan the tokens strictly between the head and its right edge
            for a in range(word.i + 1, word.right_edge.i):
                paraula = doc[a]
                if paraula.pos_ == "VERB":
                    right = paraula.left_edge.i
                    prev_end = paraula.left_edge.i - 1
                elif paraula.dep_ == "appos":
                    right = paraula.left_edge.i + 1
                    prev_end = paraula.left_edge.i - 1
            # leave punctuation out of the right side of the chunk
            if word.right_edge.pos_ == "PUNCT":
                right = right - 1
            yield left, right, np_label
|
||||||
|
|
||||||
|
|
||||||
|
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
|
@ -24,6 +24,13 @@ for exc_data in [
|
||||||
{ORTH: "núm", NORM: "número"},
|
{ORTH: "núm", NORM: "número"},
|
||||||
{ORTH: "St.", NORM: "sant"},
|
{ORTH: "St.", NORM: "sant"},
|
||||||
{ORTH: "Sta.", NORM: "santa"},
|
{ORTH: "Sta.", NORM: "santa"},
|
||||||
|
{ORTH: "'l"},
|
||||||
|
{ORTH: "'ls"},
|
||||||
|
{ORTH: "'m"},
|
||||||
|
{ORTH: "'n"},
|
||||||
|
{ORTH: "'ns"},
|
||||||
|
{ORTH: "'s"},
|
||||||
|
{ORTH: "'t"},
|
||||||
]:
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
|
|
@ -72,7 +72,7 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua
|
||||||
subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
|
subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
|
||||||
sullo suo suoi
|
sullo suo suoi
|
||||||
|
|
||||||
tale tali talvolta tanto te tempo ti titolo torino tra tranne tre trenta
|
tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
|
||||||
troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto
|
troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto
|
||||||
|
|
||||||
uguali ulteriore ultimo un una uno uomo
|
uguali ulteriore ultimo un una uno uomo
|
||||||
|
|
|
@ -1,8 +1,15 @@
|
||||||
|
from typing import Any, Dict, Union
|
||||||
|
from pathlib import Path
|
||||||
|
import re
|
||||||
|
import srsly
|
||||||
|
import string
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||||
|
from ... import util
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_CONFIG = """
|
DEFAULT_CONFIG = """
|
||||||
|
@ -40,17 +47,108 @@ class VietnameseTokenizer(DummyTokenizer):
|
||||||
|
|
||||||
def __call__(self, text: str) -> Doc:
|
def __call__(self, text: str) -> Doc:
|
||||||
if self.use_pyvi:
|
if self.use_pyvi:
|
||||||
words, spaces = self.ViTokenizer.spacy_tokenize(text)
|
words = self.pyvi_tokenize(text)
|
||||||
|
words, spaces = util.get_words_and_spaces(words, text)
|
||||||
return Doc(self.vocab, words=words, spaces=spaces)
|
return Doc(self.vocab, words=words, spaces=spaces)
|
||||||
else:
|
else:
|
||||||
words = []
|
words, spaces = util.get_words_and_spaces(text.split(), text)
|
||||||
spaces = []
|
|
||||||
for token in self.tokenizer(text):
|
|
||||||
words.extend(list(token.text))
|
|
||||||
spaces.extend([False] * len(token.text))
|
|
||||||
spaces[-1] = bool(token.whitespace_)
|
|
||||||
return Doc(self.vocab, words=words, spaces=spaces)
|
return Doc(self.vocab, words=words, spaces=spaces)
|
||||||
|
|
||||||
|
# The methods pyvi_sylabelize_with_ws and pyvi_tokenize are adapted from
|
||||||
|
# pyvi v0.1, MIT License, Copyright (c) 2016 Viet-Trung Tran.
|
||||||
|
# See licenses/3rd_party_licenses.txt
|
||||||
|
def pyvi_sylabelize_with_ws(self, text):
    """Modified from pyvi to preserve whitespace and skip unicode
    normalization.

    text (str): The text to split.
    RETURNS (List[str]): The consecutive segments of `text`: whitespace runs,
        abbreviations, special symbols, URLs, e-mail addresses, numbers,
        single non-word characters and words. Joining the segments gives
        back the input text.
    """
    specials = [r"==>", r"->", r"\.\.\.", r">>"]
    digit = r"\d+([\.,_]\d+)+"
    email = r"([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)"
    web = r"\w+://[^\s]+"
    word = r"\w+"
    non_word = r"[^\w\s]"
    abbreviations = [
        r"[A-ZĐ]+\.",
        r"Tp\.",
        r"Mr\.",
        r"Mrs\.",
        r"Ms\.",
        r"Dr\.",
        r"ThS\.",
    ]

    # Alternation order is significant: earlier alternatives win, so the
    # specific patterns must come before the generic word/non-word ones.
    alternatives = abbreviations + specials + [web, email, digit, non_word, word]
    pattern = r"(\s+|" + "|".join(alternatives) + ")"

    # The outer group spans the whole match, so group(0) is the segment.
    return [match.group(0) for match in re.finditer(pattern, text, re.UNICODE)]
|
||||||
|
|
||||||
|
def pyvi_tokenize(self, text):
    """Modified from pyvi to preserve text and whitespace.

    text (str): The text to tokenize.
    RETURNS (List[str]): The tokens. Segments that are merged into one token
        keep the original whitespace that separated them. An empty text gives
        an empty list; a whitespace-only text is returned as a single token.
    """
    if not text:
        return []
    elif text.isspace():
        return [text]
    segs = self.pyvi_sylabelize_with_ws(text)
    words = []
    preceding_ws = []
    for i, token in enumerate(segs):
        if not token.isspace():
            words.append(token)
            # Remember the whitespace segment directly before this word (or
            # "" if there is none) so merged tokens can reproduce it.
            preceding_ws.append(
                "" if (i == 0 or not segs[i - 1].isspace()) else segs[i - 1]
            )
    labels = self.ViTokenizer.ViTokenizer.model.predict(
        [self.ViTokenizer.ViTokenizer.sent2features(words, False)]
    )
    tags = labels[0]
    punctuation = string.punctuation
    token = words[0]
    tokens = []
    for i in range(1, len(tags)):
        # Merge into the current token only when the label is "I_W" and
        # neither side is punctuation, starts with a digit, or switches from
        # a non-titlecase to a titlecase initial.
        if (
            tags[i] == "I_W"
            and words[i] not in punctuation
            and words[i - 1] not in punctuation
            and not words[i][0].isdigit()
            and not words[i - 1][0].isdigit()
            and not (words[i][0].istitle() and not words[i - 1][0].istitle())
        ):
            token = token + preceding_ws[i] + words[i]
        else:
            tokens.append(token)
            token = words[i]
    tokens.append(token)
    return tokens
|
||||||
|
|
||||||
|
def _get_config(self) -> Dict[str, Any]:
    """Return the tokenizer settings as a dict.

    RETURNS (Dict[str, Any]): A dict with the single key "use_pyvi".
    """
    return {"use_pyvi": self.use_pyvi}
|
||||||
|
|
||||||
|
def _set_config(self, config: Dict[str, Any] = {}) -> None:
    """Apply tokenizer settings from a dict.

    config (Dict[str, Any]): Settings to apply. Only "use_pyvi" is read; it
        falls back to False when the key is missing. The default dict is
        only read, never mutated.
    """
    self.use_pyvi = config.get("use_pyvi", False)
|
||||||
|
|
||||||
|
def to_bytes(self, **kwargs) -> bytes:
    """Serialize the tokenizer settings to a bytestring.

    Only the config from _get_config() is written, JSON-encoded under the
    "cfg" key.

    **kwargs: Accepted but not used by this method.
    RETURNS (bytes): The value returned by util.to_bytes.
    """
    serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
    return util.to_bytes(serializers, [])
|
||||||
|
|
||||||
|
def from_bytes(self, data: bytes, **kwargs) -> "VietnameseTokenizer":
    """Load the tokenizer settings from a bytestring.

    data (bytes): The data to load; its "cfg" entry is JSON-decoded and
        applied through _set_config().
    **kwargs: Accepted but not used by this method.
    RETURNS (VietnameseTokenizer): The tokenizer itself.
    """
    deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
    util.from_bytes(data, deserializers, [])
    return self
|
||||||
|
|
||||||
|
def to_disk(self, path: Union[str, Path], **kwargs) -> None:
    """Save the tokenizer settings to a directory.

    Only the config from _get_config() is written, as JSON under the "cfg"
    entry.

    path (Union[str, Path]): The target location; normalized with
        util.ensure_path.
    **kwargs: Accepted but not used by this method.
    """
    path = util.ensure_path(path)
    serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
    return util.to_disk(path, serializers, [])
|
||||||
|
|
||||||
|
def from_disk(self, path: Union[str, Path], **kwargs) -> "VietnameseTokenizer":
    """Load the tokenizer settings from a directory.

    path (Union[str, Path]): The source location; normalized with
        util.ensure_path. Its "cfg" entry is read as JSON and applied
        through _set_config().
    **kwargs: Accepted but not used by this method.
    RETURNS (VietnameseTokenizer): The tokenizer itself.
    """
    path = util.ensure_path(path)
    # These callables read data, so name them like the ones in from_bytes.
    deserializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
    util.from_disk(path, deserializers, [])
    return self
|
||||||
|
|
||||||
|
|
||||||
class VietnameseDefaults(Language.Defaults):
|
class VietnameseDefaults(Language.Defaults):
|
||||||
config = load_config_from_str(DEFAULT_CONFIG)
|
config = load_config_from_str(DEFAULT_CONFIG)
|
||||||
|
|
|
@ -690,7 +690,7 @@ class Language:
|
||||||
if self.vocab.vectors.shape != source.vocab.vectors.shape or \
|
if self.vocab.vectors.shape != source.vocab.vectors.shape or \
|
||||||
self.vocab.vectors.key2row != source.vocab.vectors.key2row or \
|
self.vocab.vectors.key2row != source.vocab.vectors.key2row or \
|
||||||
self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes():
|
self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes():
|
||||||
util.logger.warning(Warnings.W113.format(name=source_name))
|
warnings.warn(Warnings.W113.format(name=source_name))
|
||||||
if not source_name in source.component_names:
|
if not source_name in source.component_names:
|
||||||
raise KeyError(
|
raise KeyError(
|
||||||
Errors.E944.format(
|
Errors.E944.format(
|
||||||
|
@ -1075,6 +1075,7 @@ class Language:
|
||||||
losses: Optional[Dict[str, float]] = None,
|
losses: Optional[Dict[str, float]] = None,
|
||||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||||
exclude: Iterable[str] = SimpleFrozenList(),
|
exclude: Iterable[str] = SimpleFrozenList(),
|
||||||
|
annotates: Iterable[str] = SimpleFrozenList(),
|
||||||
):
|
):
|
||||||
"""Update the models in the pipeline.
|
"""Update the models in the pipeline.
|
||||||
|
|
||||||
|
@ -1082,10 +1083,13 @@ class Language:
|
||||||
_: Should not be set - serves to catch backwards-incompatible scripts.
|
_: Should not be set - serves to catch backwards-incompatible scripts.
|
||||||
drop (float): The dropout rate.
|
drop (float): The dropout rate.
|
||||||
sgd (Optimizer): An optimizer.
|
sgd (Optimizer): An optimizer.
|
||||||
losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
|
losses (Dict[str, float]): Dictionary to update with the loss, keyed by
|
||||||
|
component.
|
||||||
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
|
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
|
||||||
components, keyed by component name.
|
components, keyed by component name.
|
||||||
exclude (Iterable[str]): Names of components that shouldn't be updated.
|
exclude (Iterable[str]): Names of components that shouldn't be updated.
|
||||||
|
annotates (Iterable[str]): Names of components that should set
|
||||||
|
annotations on the predicted examples after updating.
|
||||||
RETURNS (Dict[str, float]): The updated losses dictionary
|
RETURNS (Dict[str, float]): The updated losses dictionary
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#update
|
DOCS: https://spacy.io/api/language#update
|
||||||
|
@ -1104,15 +1108,16 @@ class Language:
|
||||||
sgd = self._optimizer
|
sgd = self._optimizer
|
||||||
if component_cfg is None:
|
if component_cfg is None:
|
||||||
component_cfg = {}
|
component_cfg = {}
|
||||||
|
pipe_kwargs = {}
|
||||||
for i, (name, proc) in enumerate(self.pipeline):
|
for i, (name, proc) in enumerate(self.pipeline):
|
||||||
component_cfg.setdefault(name, {})
|
component_cfg.setdefault(name, {})
|
||||||
|
pipe_kwargs[name] = deepcopy(component_cfg[name])
|
||||||
component_cfg[name].setdefault("drop", drop)
|
component_cfg[name].setdefault("drop", drop)
|
||||||
|
pipe_kwargs[name].setdefault("batch_size", self.batch_size)
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if name in exclude or not hasattr(proc, "update"):
|
if name not in exclude and hasattr(proc, "update"):
|
||||||
continue
|
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
|
||||||
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
|
if sgd not in (None, False):
|
||||||
if sgd not in (None, False):
|
|
||||||
for name, proc in self.pipeline:
|
|
||||||
if (
|
if (
|
||||||
name not in exclude
|
name not in exclude
|
||||||
and hasattr(proc, "is_trainable")
|
and hasattr(proc, "is_trainable")
|
||||||
|
@ -1120,6 +1125,18 @@ class Language:
|
||||||
and proc.model not in (True, False, None)
|
and proc.model not in (True, False, None)
|
||||||
):
|
):
|
||||||
proc.finish_update(sgd)
|
proc.finish_update(sgd)
|
||||||
|
if name in annotates:
|
||||||
|
for doc, eg in zip(
|
||||||
|
_pipe(
|
||||||
|
(eg.predicted for eg in examples),
|
||||||
|
proc=proc,
|
||||||
|
name=name,
|
||||||
|
default_error_handler=self.default_error_handler,
|
||||||
|
kwargs=pipe_kwargs[name],
|
||||||
|
),
|
||||||
|
examples,
|
||||||
|
):
|
||||||
|
eg.predicted = doc
|
||||||
return losses
|
return losses
|
||||||
|
|
||||||
def rehearse(
|
def rehearse(
|
||||||
|
|
|
@ -4,6 +4,7 @@ from collections import defaultdict
|
||||||
from itertools import product
|
from itertools import product
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
import warnings
|
||||||
|
|
||||||
from .matcher cimport Matcher
|
from .matcher cimport Matcher
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
|
@ -11,7 +12,6 @@ from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..tokens import Span
|
from ..tokens import Span
|
||||||
from ..util import logger
|
|
||||||
|
|
||||||
|
|
||||||
DELIMITER = "||"
|
DELIMITER = "||"
|
||||||
|
@ -282,7 +282,7 @@ cdef class DependencyMatcher:
|
||||||
keys_to_position_maps = defaultdict(lambda: defaultdict(list))
|
keys_to_position_maps = defaultdict(lambda: defaultdict(list))
|
||||||
for match_id, start, end in self._matcher(doc):
|
for match_id, start, end in self._matcher(doc):
|
||||||
if start + 1 != end:
|
if start + 1 != end:
|
||||||
logger.warning(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0]))
|
warnings.warn(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0]))
|
||||||
token = doc[start]
|
token = doc[start]
|
||||||
root = ([token] + list(token.ancestors))[-1]
|
root = ([token] + list(token.ancestors))[-1]
|
||||||
keys_to_position_maps[root.i][match_id].append(start)
|
keys_to_position_maps[root.i][match_id].append(start)
|
||||||
|
|
|
@ -50,6 +50,8 @@ cdef class PhraseMatcher:
|
||||||
if isinstance(attr, (int, long)):
|
if isinstance(attr, (int, long)):
|
||||||
self.attr = attr
|
self.attr = attr
|
||||||
else:
|
else:
|
||||||
|
if attr is None:
|
||||||
|
attr = "ORTH"
|
||||||
attr = attr.upper()
|
attr = attr.upper()
|
||||||
if attr == "TEXT":
|
if attr == "TEXT":
|
||||||
attr = "ORTH"
|
attr = "ORTH"
|
||||||
|
|
|
@ -1,14 +1,11 @@
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from preshed.maps cimport PreshMap, PreshMapArray
|
from preshed.maps cimport PreshMap
|
||||||
from libc.stdint cimport uint64_t
|
|
||||||
from murmurhash cimport mrmr
|
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
|
from libc.stdint cimport uint64_t
|
||||||
|
|
||||||
from .structs cimport TokenC, MorphAnalysisC
|
from .structs cimport MorphAnalysisC
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
from .typedefs cimport hash_t, attr_t, flags_t
|
from .typedefs cimport attr_t, hash_t
|
||||||
from .parts_of_speech cimport univ_pos_t
|
|
||||||
from . cimport symbols
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
|
@ -16,14 +13,6 @@ cdef class Morphology:
|
||||||
cdef readonly StringStore strings
|
cdef readonly StringStore strings
|
||||||
cdef PreshMap tags # Keyed by hash, value is pointer to tag
|
cdef PreshMap tags # Keyed by hash, value is pointer to tag
|
||||||
|
|
||||||
cdef public object lemmatizer
|
|
||||||
cdef readonly object tag_map
|
|
||||||
cdef readonly object tag_names
|
|
||||||
cdef readonly object reverse_index
|
|
||||||
cdef readonly object _exc
|
|
||||||
cdef readonly PreshMapArray _cache
|
|
||||||
cdef readonly int n_tags
|
|
||||||
|
|
||||||
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
|
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
|
||||||
cdef int insert(self, MorphAnalysisC tag) except -1
|
cdef int insert(self, MorphAnalysisC tag) except -1
|
||||||
|
|
||||||
|
|
|
@ -1,20 +1,11 @@
|
||||||
# cython: infer_types
|
# cython: infer_types
|
||||||
from libc.string cimport memset
|
|
||||||
|
|
||||||
import srsly
|
|
||||||
from collections import Counter
|
|
||||||
import numpy
|
import numpy
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .attrs cimport POS, IS_SPACE
|
from .attrs cimport POS
|
||||||
from .parts_of_speech cimport SPACE
|
|
||||||
from .lexeme cimport Lexeme
|
|
||||||
|
|
||||||
from .strings import get_string_id
|
|
||||||
from .attrs import LEMMA, intify_attrs
|
|
||||||
from .parts_of_speech import IDS as POS_IDS
|
from .parts_of_speech import IDS as POS_IDS
|
||||||
from .errors import Errors, Warnings
|
from .errors import Warnings
|
||||||
from .util import ensure_path
|
|
||||||
from . import symbols
|
from . import symbols
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -481,7 +481,8 @@ class EntityLinker(TrainablePipe):
|
||||||
|
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(p.open("rb").read())
|
with p.open("rb") as infile:
|
||||||
|
self.model.from_bytes(infile.read())
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
raise ValueError(Errors.E149) from None
|
raise ValueError(Errors.E149) from None
|
||||||
|
|
||||||
|
|
|
@ -102,17 +102,12 @@ class EntityRuler(Pipe):
|
||||||
self.overwrite = overwrite_ents
|
self.overwrite = overwrite_ents
|
||||||
self.token_patterns = defaultdict(list)
|
self.token_patterns = defaultdict(list)
|
||||||
self.phrase_patterns = defaultdict(list)
|
self.phrase_patterns = defaultdict(list)
|
||||||
|
self._validate = validate
|
||||||
self.matcher = Matcher(nlp.vocab, validate=validate)
|
self.matcher = Matcher(nlp.vocab, validate=validate)
|
||||||
if phrase_matcher_attr is not None:
|
self.phrase_matcher_attr = phrase_matcher_attr
|
||||||
if phrase_matcher_attr.upper() == "TEXT":
|
self.phrase_matcher = PhraseMatcher(
|
||||||
phrase_matcher_attr = "ORTH"
|
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
|
||||||
self.phrase_matcher_attr = phrase_matcher_attr
|
)
|
||||||
self.phrase_matcher = PhraseMatcher(
|
|
||||||
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.phrase_matcher_attr = None
|
|
||||||
self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
|
|
||||||
self.ent_id_sep = ent_id_sep
|
self.ent_id_sep = ent_id_sep
|
||||||
self._ent_ids = defaultdict(dict)
|
self._ent_ids = defaultdict(dict)
|
||||||
if patterns is not None:
|
if patterns is not None:
|
||||||
|
@ -317,20 +312,27 @@ class EntityRuler(Pipe):
|
||||||
pattern = entry["pattern"]
|
pattern = entry["pattern"]
|
||||||
if isinstance(pattern, Doc):
|
if isinstance(pattern, Doc):
|
||||||
self.phrase_patterns[label].append(pattern)
|
self.phrase_patterns[label].append(pattern)
|
||||||
|
self.phrase_matcher.add(label, [pattern])
|
||||||
elif isinstance(pattern, list):
|
elif isinstance(pattern, list):
|
||||||
self.token_patterns[label].append(pattern)
|
self.token_patterns[label].append(pattern)
|
||||||
|
self.matcher.add(label, [pattern])
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E097.format(pattern=pattern))
|
raise ValueError(Errors.E097.format(pattern=pattern))
|
||||||
for label, patterns in self.token_patterns.items():
|
|
||||||
self.matcher.add(label, patterns)
|
|
||||||
for label, patterns in self.phrase_patterns.items():
|
|
||||||
self.phrase_matcher.add(label, patterns)
|
|
||||||
|
|
||||||
def clear(self) -> None:
|
def clear(self) -> None:
|
||||||
"""Reset all patterns."""
|
"""Reset all patterns."""
|
||||||
self.token_patterns = defaultdict(list)
|
self.token_patterns = defaultdict(list)
|
||||||
self.phrase_patterns = defaultdict(list)
|
self.phrase_patterns = defaultdict(list)
|
||||||
self._ent_ids = defaultdict(dict)
|
self._ent_ids = defaultdict(dict)
|
||||||
|
self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
|
||||||
|
self.phrase_matcher = PhraseMatcher(
|
||||||
|
self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
|
||||||
|
)
|
||||||
|
|
||||||
|
def _require_patterns(self) -> None:
|
||||||
|
"""Raise a warning if this component has no patterns defined."""
|
||||||
|
if len(self) == 0:
|
||||||
|
warnings.warn(Warnings.W036.format(name=self.name))
|
||||||
|
|
||||||
def _require_patterns(self) -> None:
|
def _require_patterns(self) -> None:
|
||||||
"""Raise a warning if this component has no patterns defined."""
|
"""Raise a warning if this component has no patterns defined."""
|
||||||
|
@ -381,10 +383,9 @@ class EntityRuler(Pipe):
|
||||||
self.add_patterns(cfg.get("patterns", cfg))
|
self.add_patterns(cfg.get("patterns", cfg))
|
||||||
self.overwrite = cfg.get("overwrite", False)
|
self.overwrite = cfg.get("overwrite", False)
|
||||||
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
|
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
|
||||||
if self.phrase_matcher_attr is not None:
|
self.phrase_matcher = PhraseMatcher(
|
||||||
self.phrase_matcher = PhraseMatcher(
|
self.nlp.vocab, attr=self.phrase_matcher_attr
|
||||||
self.nlp.vocab, attr=self.phrase_matcher_attr
|
)
|
||||||
)
|
|
||||||
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
|
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
|
||||||
else:
|
else:
|
||||||
self.add_patterns(cfg)
|
self.add_patterns(cfg)
|
||||||
|
@ -435,10 +436,9 @@ class EntityRuler(Pipe):
|
||||||
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
|
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
|
||||||
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
|
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
|
||||||
|
|
||||||
if self.phrase_matcher_attr is not None:
|
self.phrase_matcher = PhraseMatcher(
|
||||||
self.phrase_matcher = PhraseMatcher(
|
self.nlp.vocab, attr=self.phrase_matcher_attr
|
||||||
self.nlp.vocab, attr=self.phrase_matcher_attr
|
)
|
||||||
)
|
|
||||||
from_disk(path, deserializers_patterns, {})
|
from_disk(path, deserializers_patterns, {})
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,8 @@ from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
|
@ -182,7 +184,7 @@ class Lemmatizer(Pipe):
|
||||||
univ_pos = token.pos_.lower()
|
univ_pos = token.pos_.lower()
|
||||||
if univ_pos in ("", "eol", "space"):
|
if univ_pos in ("", "eol", "space"):
|
||||||
if univ_pos == "":
|
if univ_pos == "":
|
||||||
logger.warning(Warnings.W108.format(text=string))
|
warnings.warn(Warnings.W108.format(text=string))
|
||||||
return [string.lower()]
|
return [string.lower()]
|
||||||
# See Issue #435 for example of where this logic is required.
|
# See Issue #435 for example of where this logic is required.
|
||||||
if self.is_base_form(token):
|
if self.is_base_form(token):
|
||||||
|
|
|
@ -324,7 +324,8 @@ cdef class TrainablePipe(Pipe):
|
||||||
|
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(p.open("rb").read())
|
with open(p, "rb") as mfile:
|
||||||
|
self.model.from_bytes(mfile.read())
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
raise ValueError(Errors.E149) from None
|
raise ValueError(Errors.E149) from None
|
||||||
|
|
||||||
|
|
|
@ -313,6 +313,7 @@ class ConfigSchemaTraining(BaseModel):
|
||||||
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
||||||
logger: Logger = Field(..., title="The logger to track training progress")
|
logger: Logger = Field(..., title="The logger to track training progress")
|
||||||
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
|
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
|
||||||
|
annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
|
||||||
before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
|
before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
|
@ -293,6 +293,12 @@ def ur_tokenizer():
|
||||||
return get_lang_class("ur")().tokenizer
|
return get_lang_class("ur")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def vi_tokenizer():
    """Provide a session-wide Vietnamese tokenizer; skip if pyvi is missing."""
    pytest.importorskip("pyvi")
    vietnamese_cls = get_lang_class("vi")
    return vietnamese_cls().tokenizer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def yo_tokenizer():
|
def yo_tokenizer():
|
||||||
return get_lang_class("yo")().tokenizer
|
return get_lang_class("yo")().tokenizer
|
||||||
|
|
|
@ -2,8 +2,6 @@ import weakref
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import numpy
|
import numpy
|
||||||
import logging
|
|
||||||
import mock
|
|
||||||
|
|
||||||
from spacy.lang.xx import MultiLanguage
|
from spacy.lang.xx import MultiLanguage
|
||||||
from spacy.tokens import Doc, Span, Token
|
from spacy.tokens import Doc, Span, Token
|
||||||
|
@ -158,13 +156,10 @@ def test_doc_api_serialize(en_tokenizer, text):
|
||||||
def inner_func(d1, d2):
|
def inner_func(d1, d2):
|
||||||
return "hello!"
|
return "hello!"
|
||||||
|
|
||||||
logger = logging.getLogger("spacy")
|
_ = tokens.to_bytes() # noqa: F841
|
||||||
with mock.patch.object(logger, "warning") as mock_warning:
|
with pytest.warns(UserWarning):
|
||||||
_ = tokens.to_bytes() # noqa: F841
|
|
||||||
mock_warning.assert_not_called()
|
|
||||||
tokens.user_hooks["similarity"] = inner_func
|
tokens.user_hooks["similarity"] = inner_func
|
||||||
_ = tokens.to_bytes() # noqa: F841
|
_ = tokens.to_bytes() # noqa: F841
|
||||||
mock_warning.assert_called_once()
|
|
||||||
|
|
||||||
|
|
||||||
def test_doc_api_set_ents(en_tokenizer):
|
def test_doc_api_set_ents(en_tokenizer):
|
||||||
|
|
|
@ -12,13 +12,13 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
|
||||||
una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
|
una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
|
||||||
|
|
||||||
tokens = ca_tokenizer(text)
|
tokens = ca_tokenizer(text)
|
||||||
assert len(tokens) == 138
|
assert len(tokens) == 140
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,length",
|
"text,length",
|
||||||
[
|
[
|
||||||
("Perquè va anar-hi?", 6),
|
("Perquè va anar-hi?", 4),
|
||||||
("“Ah no?”", 5),
|
("“Ah no?”", 5),
|
||||||
("""Sí! "Anem", va contestar el Joan Carles""", 11),
|
("""Sí! "Anem", va contestar el Joan Carles""", 11),
|
||||||
("Van córrer aprox. 10km", 5),
|
("Van córrer aprox. 10km", 5),
|
||||||
|
|
|
@ -8,7 +8,7 @@ from spacy.util import get_lang_class
|
||||||
# Only include languages with no external dependencies
|
# Only include languages with no external dependencies
|
||||||
# excluded: ru, uk
|
# excluded: ru, uk
|
||||||
# excluded for custom tables: es, pl
|
# excluded for custom tables: es, pl
|
||||||
LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
|
LANGUAGES = ["bn", "ca", "el", "en", "fa", "fr", "nb", "nl", "sv"]
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
|
|
0
spacy/tests/lang/vi/__init__.py
Normal file
0
spacy/tests/lang/vi/__init__.py
Normal file
33
spacy/tests/lang/vi/test_serialize.py
Normal file
33
spacy/tests/lang/vi/test_serialize.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
from spacy.lang.vi import Vietnamese
|
||||||
|
from ...util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def test_vi_tokenizer_serialize(vi_tokenizer):
    """Round-trip the Vietnamese tokenizer through bytes and disk.

    Covers both the tokenizer on its own (use_pyvi enabled) and a whole
    pipeline created with use_pyvi disabled, checking that the serialized
    bytes and the use_pyvi flag survive each round trip.
    """
    # Tokenizer only: bytes round trip.
    tokenizer_bytes = vi_tokenizer.to_bytes()
    nlp = Vietnamese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
    assert nlp.tokenizer.use_pyvi is True

    # Tokenizer only: disk round trip.
    with make_tempdir() as d:
        file_path = d / "tokenizer"
        vi_tokenizer.to_disk(file_path)
        nlp = Vietnamese()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
        assert nlp.tokenizer.use_pyvi is True

    # mode is (de)serialized correctly
    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    nlp_bytes = nlp.to_bytes()
    nlp_r = Vietnamese()
    nlp_r.from_bytes(nlp_bytes)
    assert nlp_bytes == nlp_r.to_bytes()
    # Identity check, matching the `is True` assertions above.
    assert nlp_r.tokenizer.use_pyvi is False

    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp_r = Vietnamese()
        nlp_r.from_disk(d)
        assert nlp_bytes == nlp_r.to_bytes()
        assert nlp_r.tokenizer.use_pyvi is False
|
47
spacy/tests/lang/vi/test_tokenizer.py
Normal file
47
spacy/tests/lang/vi/test_tokenizer.py
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
|
||||||
|
from spacy.lang.vi import Vietnamese
|
||||||
|
|
||||||
|
|
||||||
|
# fmt: off
# (input text, expected token texts) pairs fed to test_vi_tokenizer.
TOKENIZER_TESTS = [
    (
        "Đây là một văn bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này",
        [
            "Đây", "là", "một", "văn bản", "bằng", "tiếng", "Việt", "Sau", "đó",
            ",", "đây", "là", "một", "văn bản", "khác", "bằng", "ngôn ngữ", "này",
        ],
    ),
]
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_vi_tokenizer(vi_tokenizer, text, expected_tokens):
    """Compare the token texts produced for `text` with the expected list."""
    doc = vi_tokenizer(text)
    assert [tok.text for tok in doc] == expected_tokens
|
||||||
|
|
||||||
|
|
||||||
|
def test_vi_tokenizer_extra_spaces(vi_tokenizer):
    """Check that surplus spaces between words end up in their own token."""
    # note: three spaces after "I"
    tokens = vi_tokenizer("I   like cheese.")
    # Expected alignment: one space is attached to "I" as its trailing
    # whitespace, the remaining two form the whitespace token at index 1.
    assert tokens[1].orth_ == "  "
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
def test_vi_tokenizer_naughty_strings(vi_tokenizer, text):
    """Check that the doc's text_with_ws reproduces each naughty string."""
    doc = vi_tokenizer(text)
    assert doc.text_with_ws == text
|
||||||
|
|
||||||
|
|
||||||
|
def test_vi_tokenizer_emptyish_texts(vi_tokenizer):
    """Check token counts for empty and whitespace-only inputs."""
    # (input text, expected number of tokens)
    cases = [
        ("", 0),
        (" ", 1),
        ("\n\n\n \t\t \n\n\n", 1),
    ]
    for text, expected_len in cases:
        doc = vi_tokenizer(text)
        assert len(doc) == expected_len
|
||||||
|
|
||||||
|
|
||||||
|
def test_vi_tokenizer_no_pyvi():
    """Test for whitespace tokenization without pyvi"""
    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    # note: two spaces after the fourth word ("văn"), so that a separate
    # whitespace token is expected at index 4
    text = "Đây là một văn  bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này"
    doc = nlp(text)
    # Ignoring whitespace tokens, the result must equal a plain str.split().
    assert [t.text for t in doc if not t.is_space] == text.split()
    assert doc[4].text == " "
|
|
@ -252,12 +252,12 @@ def test_ruler_before_ner():
|
||||||
# 1 : Entity Ruler - should set "this" to B and everything else to empty
|
# 1 : Entity Ruler - should set "this" to B and everything else to empty
|
||||||
patterns = [{"label": "THING", "pattern": "This"}]
|
patterns = [{"label": "THING", "pattern": "This"}]
|
||||||
ruler = nlp.add_pipe("entity_ruler")
|
ruler = nlp.add_pipe("entity_ruler")
|
||||||
ruler.add_patterns(patterns)
|
|
||||||
|
|
||||||
# 2: untrained NER - should set everything else to O
|
# 2: untrained NER - should set everything else to O
|
||||||
untrained_ner = nlp.add_pipe("ner")
|
untrained_ner = nlp.add_pipe("ner")
|
||||||
untrained_ner.add_label("MY_LABEL")
|
untrained_ner.add_label("MY_LABEL")
|
||||||
nlp.initialize()
|
nlp.initialize()
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
doc = nlp("This is Antti Korhonen speaking in Finland")
|
doc = nlp("This is Antti Korhonen speaking in Finland")
|
||||||
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
|
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
|
||||||
expected_types = ["THING", "", "", "", "", "", ""]
|
expected_types = ["THING", "", "", "", "", "", ""]
|
||||||
|
|
113
spacy/tests/pipeline/test_annotates_on_update.py
Normal file
113
spacy/tests/pipeline/test_annotates_on_update.py
Normal file
|
@ -0,0 +1,113 @@
|
||||||
|
from typing import Callable, Iterable, Iterator
|
||||||
|
import pytest
|
||||||
|
import io
|
||||||
|
|
||||||
|
from thinc.api import Config
|
||||||
|
from spacy.language import Language
|
||||||
|
from spacy.training import Example
|
||||||
|
from spacy.training.loop import train
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.util import registry, load_model_from_config
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def config_str():
    """Return a config string for a sentencizer + assert_sents pipeline.

    The [training] section lists the sentencizer as an annotating component
    and both corpora use the "unannotated_corpus" reader.
    """
    config = """
    [nlp]
    lang = "en"
    pipeline = ["sentencizer","assert_sents"]
    disabled = []
    before_creation = null
    after_creation = null
    after_pipeline_creation = null
    batch_size = 1000
    tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

    [components]

    [components.assert_sents]
    factory = "assert_sents"

    [components.sentencizer]
    factory = "sentencizer"
    punct_chars = null

    [training]
    dev_corpus = "corpora.dev"
    train_corpus = "corpora.train"
    annotating_components = ["sentencizer"]
    max_steps = 2

    [corpora]

    [corpora.dev]
    @readers = "unannotated_corpus"

    [corpora.train]
    @readers = "unannotated_corpus"
    """
    return config
|
||||||
|
|
||||||
|
|
||||||
|
def test_annotates_on_update():
    """Check that `annotates` makes earlier components annotate during update.

    A custom "assert_sents" component raises ValueError whenever it sees a
    doc without sentence boundaries, both when called and when updated.
    """
    # The custom component checks for sentence annotation
    @Language.factory("assert_sents", default_config={})
    def assert_sents(nlp, name):
        return AssertSents(name)

    class AssertSents:
        def __init__(self, name, **cfg):
            self.name = name

        def __call__(self, doc):
            if not doc.has_annotation("SENT_START"):
                raise ValueError("No sents")
            return doc

        def update(self, examples, *, drop=0.0, sgd=None, losses=None):
            for example in examples:
                if not example.predicted.has_annotation("SENT_START"):
                    raise ValueError("No sents")
            return {}

    nlp = English()
    nlp.add_pipe("sentencizer")
    nlp.add_pipe("assert_sents")

    # When the pipeline runs, annotations are set
    # (assert_sents would raise here otherwise; the result itself is unused)
    nlp("This is a sentence.")

    # Predicted docs come from make_doc (no sentence boundaries); reference
    # docs come from the full pipeline.
    examples = [
        Example(nlp.make_doc(text), nlp(text)) for text in ["a a", "b b", "c c"]
    ]

    for example in examples:
        assert not example.predicted.has_annotation("SENT_START")

    # If updating without setting annotations, assert_sents will raise an error
    with pytest.raises(ValueError):
        nlp.update(examples)

    # Updating while setting annotations for the sentencizer succeeds
    nlp.update(examples, annotates=["sentencizer"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_annotating_components_from_config(config_str):
    """Check that [training] annotating_components is read and used by train().

    Training on the unannotated corpus is expected to run with the
    sentencizer listed as annotating component and to raise ValueError once
    the list is emptied.
    """

    class UnannotatedCorpus:
        def __call__(self, nlp: Language) -> Iterator[Example]:
            # The same unannotated doc serves as predicted and reference.
            for raw_text in ["a a", "b b", "c c"]:
                doc = nlp.make_doc(raw_text)
                yield Example(doc, doc)

    @registry.readers("unannotated_corpus")
    def create_unannotated_corpus() -> Callable[[Language], Iterable[Example]]:
        return UnannotatedCorpus()

    parsed_config = Config().from_str(config_str)
    nlp = load_model_from_config(parsed_config, auto_fill=True, validate=True)
    training_cfg = nlp.config["training"]
    assert training_cfg["annotating_components"] == ["sentencizer"]
    train(nlp)

    nlp.config["training"]["annotating_components"] = []
    with pytest.raises(ValueError):
        train(nlp)
|
|
@ -89,6 +89,19 @@ def test_entity_ruler_init_clear(nlp, patterns):
|
||||||
assert len(ruler.labels) == 0
|
assert len(ruler.labels) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_entity_ruler_clear(nlp, patterns):
    """Test that clear() removes all patterns, so nothing is matched afterwards."""
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
    assert len(ruler.labels) == 4
    doc = nlp("hello world")
    assert len(doc.ents) == 1
    ruler.clear()
    # After clearing, both the labels and the matches must be gone.
    assert len(ruler.labels) == 0
    doc = nlp("hello world")
    assert len(doc.ents) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_existing(nlp, patterns):
|
def test_entity_ruler_existing(nlp, patterns):
|
||||||
ruler = nlp.add_pipe("entity_ruler")
|
ruler = nlp.add_pipe("entity_ruler")
|
||||||
ruler.add_patterns(patterns)
|
ruler.add_patterns(patterns)
|
||||||
|
|
|
@ -1,6 +1,4 @@
|
||||||
import pytest
|
import pytest
|
||||||
import logging
|
|
||||||
import mock
|
|
||||||
import pickle
|
import pickle
|
||||||
from spacy import util, registry
|
from spacy import util, registry
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
@ -59,10 +57,10 @@ def test_lemmatizer_config(nlp):
|
||||||
|
|
||||||
# warning if no POS assigned
|
# warning if no POS assigned
|
||||||
doc = nlp.make_doc("coping")
|
doc = nlp.make_doc("coping")
|
||||||
logger = logging.getLogger("spacy")
|
with pytest.warns(UserWarning):
|
||||||
with mock.patch.object(logger, "warning") as mock_warning:
|
|
||||||
doc = lemmatizer(doc)
|
doc = lemmatizer(doc)
|
||||||
mock_warning.assert_called_once()
|
# warns once by default
|
||||||
|
doc = lemmatizer(doc)
|
||||||
|
|
||||||
# works with POS
|
# works with POS
|
||||||
doc = nlp.make_doc("coping")
|
doc = nlp.make_doc("coping")
|
||||||
|
|
|
@ -1,6 +1,4 @@
|
||||||
import pytest
|
import pytest
|
||||||
import mock
|
|
||||||
import logging
|
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.de import German
|
from spacy.lang.de import German
|
||||||
|
@ -334,24 +332,31 @@ def test_language_factories_invalid():
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"weights,expected",
|
"weights,override,expected",
|
||||||
[
|
[
|
||||||
([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}),
|
([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {}, {"a": 0.33, "b": 0.33, "c": 0.33}),
|
||||||
([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}),
|
([{"a": 1.0}, {"b": 50}, {"c": 100}], {}, {"a": 0.01, "b": 0.33, "c": 0.66}),
|
||||||
(
|
(
|
||||||
[{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}],
|
[{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}],
|
||||||
|
{},
|
||||||
{"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17},
|
{"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17},
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
[{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
|
[{"a": 100, "b": 300}, {"c": 50, "d": 50}],
|
||||||
{"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
|
{},
|
||||||
|
{"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1},
|
||||||
),
|
),
|
||||||
([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
|
([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {}, {"a": 0.33, "b": 0.67}),
|
||||||
([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
|
([{"a": 0.5, "b": 0.0}], {}, {"a": 1.0, "b": 0.0}),
|
||||||
|
([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}),
|
||||||
|
([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}),
|
||||||
|
([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}),
|
||||||
|
([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}),
|
||||||
|
([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_language_factories_combine_score_weights(weights, expected):
|
def test_language_factories_combine_score_weights(weights, override, expected):
|
||||||
result = combine_score_weights(weights)
|
result = combine_score_weights(weights, override)
|
||||||
assert sum(result.values()) in (0.99, 1.0, 0.0)
|
assert sum(result.values()) in (0.99, 1.0, 0.0)
|
||||||
assert result == expected
|
assert result == expected
|
||||||
|
|
||||||
|
@ -377,17 +382,17 @@ def test_language_factories_scores():
|
||||||
# Test with custom defaults
|
# Test with custom defaults
|
||||||
config = nlp.config.copy()
|
config = nlp.config.copy()
|
||||||
config["training"]["score_weights"]["a1"] = 0.0
|
config["training"]["score_weights"]["a1"] = 0.0
|
||||||
config["training"]["score_weights"]["b3"] = 1.0
|
config["training"]["score_weights"]["b3"] = 1.3
|
||||||
nlp = English.from_config(config)
|
nlp = English.from_config(config)
|
||||||
score_weights = nlp.config["training"]["score_weights"]
|
score_weights = nlp.config["training"]["score_weights"]
|
||||||
expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
|
expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65}
|
||||||
assert score_weights == expected
|
assert score_weights == expected
|
||||||
# Test with null values
|
# Test with null values
|
||||||
config = nlp.config.copy()
|
config = nlp.config.copy()
|
||||||
config["training"]["score_weights"]["a1"] = None
|
config["training"]["score_weights"]["a1"] = None
|
||||||
nlp = English.from_config(config)
|
nlp = English.from_config(config)
|
||||||
score_weights = nlp.config["training"]["score_weights"]
|
score_weights = nlp.config["training"]["score_weights"]
|
||||||
expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
|
expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66}
|
||||||
assert score_weights == expected
|
assert score_weights == expected
|
||||||
|
|
||||||
|
|
||||||
|
@ -430,10 +435,8 @@ def test_pipe_factories_from_source_language_subclass():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.vocab.vectors.resize((1, 4))
|
nlp.vocab.vectors.resize((1, 4))
|
||||||
nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4])
|
nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4])
|
||||||
logger = logging.getLogger("spacy")
|
with pytest.warns(UserWarning):
|
||||||
with mock.patch.object(logger, "warning") as mock_warning:
|
|
||||||
nlp.add_pipe("tagger", source=source_nlp)
|
nlp.add_pipe("tagger", source=source_nlp)
|
||||||
mock_warning.assert_called()
|
|
||||||
|
|
||||||
|
|
||||||
def test_pipe_factories_from_source_custom():
|
def test_pipe_factories_from_source_custom():
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.pipeline import TrainablePipe
|
from spacy.pipeline import TrainablePipe
|
||||||
|
from spacy.training import Example
|
||||||
from spacy.util import SimpleFrozenList, get_arg_names
|
from spacy.util import SimpleFrozenList, get_arg_names
|
||||||
|
from spacy.lang.en import English
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -417,3 +419,41 @@ def test_pipe_methods_initialize():
|
||||||
assert "test" in nlp.config["initialize"]["components"]
|
assert "test" in nlp.config["initialize"]["components"]
|
||||||
nlp.remove_pipe("test")
|
nlp.remove_pipe("test")
|
||||||
assert "test" not in nlp.config["initialize"]["components"]
|
assert "test" not in nlp.config["initialize"]["components"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_with_annotates():
    """Check that nlp.update runs exactly the components listed in `annotates`.

    Two recording components append every doc's text to a shared `results`
    dict. After each update, the components named in `annotates` must have
    recorded all example texts and the others must have recorded nothing.
    """
    name = "test_with_annotates"
    results = {}

    def make_component(name):
        results[name] = ""

        def component(doc):
            # `results` is only mutated, never rebound, so no `nonlocal` is needed.
            results[name] += doc.text
            return doc

        return component

    component_names = [f"{name}1", f"{name}2"]
    for component_name in component_names:
        # Registration is the side effect needed here; the return value is unused.
        Language.component(component_name, func=make_component(component_name))
    components = set(component_names)

    nlp = English()
    texts = ["a", "bb", "ccc"]
    examples = [Example(nlp.make_doc(text), nlp.make_doc(text)) for text in texts]

    for components_to_annotate in [
        [],
        [f"{name}1"],
        [f"{name}1", f"{name}2"],
        [f"{name}2", f"{name}1"],
    ]:
        # Reset the recordings and build a fresh pipeline on the same vocab.
        for key in results:
            results[key] = ""
        nlp = English(vocab=nlp.vocab)
        nlp.add_pipe(f"{name}1")
        nlp.add_pipe(f"{name}2")
        nlp.update(examples, annotates=components_to_annotate)
        expected_text = "".join(eg.predicted.text for eg in examples)
        for component in components_to_annotate:
            assert results[component] == expected_text
        for component in components - set(components_to_annotate):
            assert results[component] == ""
|
||||||
|
|
34
spacy/tests/regression/test_issue8216.py
Normal file
34
spacy/tests/regression/test_issue8216.py
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from spacy import registry
|
||||||
|
from spacy.language import Language
|
||||||
|
from spacy.pipeline import EntityRuler
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def nlp():
    """Provide a fresh base Language pipeline for each test."""
    pipeline = Language()
    return pipeline
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
@registry.misc("entity_ruler_patterns")
def patterns():
    """Return six sample entity ruler patterns.

    The first four entries mix phrase (string) and token (list of dicts)
    patterns without ids; the last two are phrase patterns carrying an "id".
    """
    without_ids = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
    ]
    with_ids = [
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
        {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"},
    ]
    # Concatenate in the original order: id-less patterns first.
    return without_ids + with_ids
|
||||||
|
|
||||||
|
|
||||||
|
def test_entity_ruler_fix8216(nlp, patterns):
    """Test that patterns don't get added excessively."""

    def count_patterns(entity_ruler):
        # Total number of entries across all keys of the token matcher.
        return sum(len(mm) for mm in entity_ruler.matcher._patterns.values())

    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
    ruler.add_patterns(patterns)
    pattern_count = count_patterns(ruler)
    assert pattern_count > 0
    # Adding an empty list must leave the matcher's pattern count unchanged.
    ruler.add_patterns([])
    assert count_patterns(ruler) == pattern_count
|
|
@ -84,7 +84,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n
|
||||||
@pytest.mark.parametrize("file_name", ["sun.txt"])
|
@pytest.mark.parametrize("file_name", ["sun.txt"])
|
||||||
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
|
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
|
||||||
loc = ensure_path(__file__).parent / file_name
|
loc = ensure_path(__file__).parent / file_name
|
||||||
text = loc.open("r", encoding="utf8").read()
|
with loc.open("r", encoding="utf8") as infile:
|
||||||
|
text = infile.read()
|
||||||
assert len(text) != 0
|
assert len(text) != 0
|
||||||
tokens = tokenizer(text)
|
tokens = tokenizer(text)
|
||||||
assert len(tokens) > 100
|
assert len(tokens) > 100
|
||||||
|
|
|
@ -14,7 +14,7 @@ cdef class Tokenizer:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef PreshMap _cache
|
cdef PreshMap _cache
|
||||||
cdef PreshMap _specials
|
cdef PreshMap _specials
|
||||||
cpdef readonly Vocab vocab
|
cdef readonly Vocab vocab
|
||||||
|
|
||||||
cdef object _token_match
|
cdef object _token_match
|
||||||
cdef object _url_match
|
cdef object _url_match
|
||||||
|
|
|
@ -1321,7 +1321,7 @@ cdef class Doc:
|
||||||
if "user_data_values" not in exclude:
|
if "user_data_values" not in exclude:
|
||||||
serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
|
serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
|
||||||
if "user_hooks" not in exclude and any((self.user_hooks, self.user_token_hooks, self.user_span_hooks)):
|
if "user_hooks" not in exclude and any((self.user_hooks, self.user_token_hooks, self.user_span_hooks)):
|
||||||
util.logger.warning(Warnings.W109)
|
warnings.warn(Warnings.W109)
|
||||||
return util.to_dict(serializers, exclude)
|
return util.to_dict(serializers, exclude)
|
||||||
|
|
||||||
def from_dict(self, msg, *, exclude=tuple()):
|
def from_dict(self, msg, *, exclude=tuple()):
|
||||||
|
|
|
@ -74,6 +74,8 @@ def train(
|
||||||
|
|
||||||
# Components that shouldn't be updated during training
|
# Components that shouldn't be updated during training
|
||||||
frozen_components = T["frozen_components"]
|
frozen_components = T["frozen_components"]
|
||||||
|
# Components that should set annotations on update
|
||||||
|
annotating_components = T["annotating_components"]
|
||||||
# Create iterator, which yields out info after each optimization step.
|
# Create iterator, which yields out info after each optimization step.
|
||||||
training_step_iterator = train_while_improving(
|
training_step_iterator = train_while_improving(
|
||||||
nlp,
|
nlp,
|
||||||
|
@ -86,11 +88,17 @@ def train(
|
||||||
max_steps=T["max_steps"],
|
max_steps=T["max_steps"],
|
||||||
eval_frequency=T["eval_frequency"],
|
eval_frequency=T["eval_frequency"],
|
||||||
exclude=frozen_components,
|
exclude=frozen_components,
|
||||||
|
annotating_components=annotating_components,
|
||||||
)
|
)
|
||||||
clean_output_dir(output_path)
|
clean_output_dir(output_path)
|
||||||
stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
|
stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
|
||||||
if frozen_components:
|
if frozen_components:
|
||||||
stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
|
stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
|
||||||
|
if annotating_components:
|
||||||
|
stdout.write(
|
||||||
|
msg.info(f"Set annotations on update for: {annotating_components}")
|
||||||
|
+ "\n"
|
||||||
|
)
|
||||||
stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
|
stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
|
||||||
with nlp.select_pipes(disable=frozen_components):
|
with nlp.select_pipes(disable=frozen_components):
|
||||||
log_step, finalize_logger = train_logger(nlp, stdout, stderr)
|
log_step, finalize_logger = train_logger(nlp, stdout, stderr)
|
||||||
|
@ -142,6 +150,7 @@ def train_while_improving(
|
||||||
patience: int,
|
patience: int,
|
||||||
max_steps: int,
|
max_steps: int,
|
||||||
exclude: List[str],
|
exclude: List[str],
|
||||||
|
annotating_components: List[str],
|
||||||
):
|
):
|
||||||
"""Train until an evaluation stops improving. Works as a generator,
|
"""Train until an evaluation stops improving. Works as a generator,
|
||||||
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
|
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
|
||||||
|
@ -193,7 +202,12 @@ def train_while_improving(
|
||||||
dropout = next(dropouts)
|
dropout = next(dropouts)
|
||||||
for subbatch in subdivide_batch(batch, accumulate_gradient):
|
for subbatch in subdivide_batch(batch, accumulate_gradient):
|
||||||
nlp.update(
|
nlp.update(
|
||||||
subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
|
subbatch,
|
||||||
|
drop=dropout,
|
||||||
|
losses=losses,
|
||||||
|
sgd=False,
|
||||||
|
exclude=exclude,
|
||||||
|
annotates=annotating_components,
|
||||||
)
|
)
|
||||||
# TODO: refactor this so we don't have to run it separately in here
|
# TODO: refactor this so we don't have to run it separately in here
|
||||||
for name, proc in nlp.pipeline:
|
for name, proc in nlp.pipeline:
|
||||||
|
|
|
@ -1370,32 +1370,14 @@ def combine_score_weights(
|
||||||
should be preserved.
|
should be preserved.
|
||||||
RETURNS (Dict[str, float]): The combined and normalized weights.
|
RETURNS (Dict[str, float]): The combined and normalized weights.
|
||||||
"""
|
"""
|
||||||
|
# We divide each weight by the total weight sum.
|
||||||
# We first need to extract all None/null values for score weights that
|
# We first need to extract all None/null values for score weights that
|
||||||
# shouldn't be shown in the table *or* be weighted
|
# shouldn't be shown in the table *or* be weighted
|
||||||
result = {}
|
result = {key: overrides.get(key, value) for w_dict in weights for (key, value) in w_dict.items()}
|
||||||
all_weights = []
|
weight_sum = sum([v if v else 0.0 for v in result.values()])
|
||||||
for w_dict in weights:
|
for key, value in result.items():
|
||||||
filtered_weights = {}
|
if value and weight_sum > 0:
|
||||||
for key, value in w_dict.items():
|
result[key] = round(value / weight_sum, 2)
|
||||||
value = overrides.get(key, value)
|
|
||||||
if value is None:
|
|
||||||
result[key] = None
|
|
||||||
else:
|
|
||||||
filtered_weights[key] = value
|
|
||||||
all_weights.append(filtered_weights)
|
|
||||||
for w_dict in all_weights:
|
|
||||||
# We need to account for weights that don't sum to 1.0 and normalize
|
|
||||||
# the score weights accordingly, then divide score by the number of
|
|
||||||
# components.
|
|
||||||
total = sum(w_dict.values())
|
|
||||||
for key, value in w_dict.items():
|
|
||||||
if total == 0:
|
|
||||||
weight = 0.0
|
|
||||||
else:
|
|
||||||
weight = round(value / total / len(all_weights), 2)
|
|
||||||
prev_weight = result.get(key, 0.0)
|
|
||||||
prev_weight = 0.0 if prev_weight is None else prev_weight
|
|
||||||
result[key] = prev_weight + weight
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -25,12 +25,12 @@ cdef struct _Cached:
|
||||||
|
|
||||||
cdef class Vocab:
|
cdef class Vocab:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cpdef readonly StringStore strings
|
cdef readonly StringStore strings
|
||||||
cpdef public Morphology morphology
|
cdef public Morphology morphology
|
||||||
cpdef public object vectors
|
cdef public object vectors
|
||||||
cpdef public object _lookups
|
cdef public object _lookups
|
||||||
cpdef public object writing_system
|
cdef public object writing_system
|
||||||
cpdef public object get_noun_chunks
|
cdef public object get_noun_chunks
|
||||||
cdef readonly int length
|
cdef readonly int length
|
||||||
cdef public object data_dir
|
cdef public object data_dir
|
||||||
cdef public object lex_attr_getters
|
cdef public object lex_attr_getters
|
||||||
|
|
|
@ -182,24 +182,25 @@ single corpus once and then divide it up into `train` and `dev` partitions.
|
||||||
This section defines settings and controls for the training and evaluation
|
This section defines settings and controls for the training and evaluation
|
||||||
process that are used when you run [`spacy train`](/api/cli#train).
|
process that are used when you run [`spacy train`](/api/cli#train).
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
|
| `accumulate_gradient` | Number of substeps to divide each batch into. Defaults to `1`. ~~int~~ |
|
||||||
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
|
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
|
||||||
| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
||||||
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
|
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
|
||||||
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
|
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
|
||||||
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
|
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
|
||||||
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
|
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
|
||||||
| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
|
| `annotating_components` | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. ~~List[str]~~ |
|
||||||
| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ |
|
| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
|
||||||
| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ |
|
| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ |
|
||||||
| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ |
|
| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ |
|
||||||
| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
|
| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ |
|
||||||
| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ |
|
| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
|
||||||
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
|
| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ |
|
||||||
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
|
||||||
| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
|
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
||||||
|
| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
|
||||||
|
|
||||||
### pretraining {#config-pretraining tag="section,optional"}
|
### pretraining {#config-pretraining tag="section,optional"}
|
||||||
|
|
||||||
|
|
|
@ -245,14 +245,14 @@ and call the optimizer, while the others simply increment the gradients.
|
||||||
> losses = trf.update(examples, sgd=optimizer)
|
> losses = trf.update(examples, sgd=optimizer)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ |
|
| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `drop` | The dropout rate. ~~float~~ |
|
| `drop` | The dropout rate. ~~float~~ |
|
||||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||||
|
|
||||||
## Transformer.create_optimizer {#create_optimizer tag="method"}
|
## Transformer.create_optimizer {#create_optimizer tag="method"}
|
||||||
|
|
||||||
|
@ -493,6 +493,11 @@ This requires sentence boundaries to be set (e.g. by the
|
||||||
depending on the sentence lengths. However, it does provide the transformer with
|
depending on the sentence lengths. However, it does provide the transformer with
|
||||||
more meaningful windows to attend over.
|
more meaningful windows to attend over.
|
||||||
|
|
||||||
|
To set sentence boundaries with the `sentencizer` during training, add a
|
||||||
|
`sentencizer` to the beginning of the pipeline and include it in
|
||||||
|
[`[training.annotating_components]`](/usage/training#annotating-components) to
|
||||||
|
have it set the sentence boundaries before the `transformer` component runs.
|
||||||
|
|
||||||
### strided_spans.v1 {#strided_spans tag="registered function"}
|
### strided_spans.v1 {#strided_spans tag="registered function"}
|
||||||
|
|
||||||
> #### Example config
|
> #### Example config
|
||||||
|
|
|
@ -422,11 +422,11 @@ as-is. They are also excluded when calling
|
||||||
> #### Note on frozen components
|
> #### Note on frozen components
|
||||||
>
|
>
|
||||||
> Even though frozen components are not **updated** during training, they will
|
> Even though frozen components are not **updated** during training, they will
|
||||||
> still **run** during training and evaluation. This is very important, because
|
> still **run** during evaluation. This is very important, because they may
|
||||||
> they may still impact your model's performance – for instance, a sentence
|
> still impact your model's performance – for instance, a sentence boundary
|
||||||
> boundary detector can impact what the parser or entity recognizer considers a
|
> detector can impact what the parser or entity recognizer considers a valid
|
||||||
> valid parse. So the evaluation results should always reflect what your
|
> parse. So the evaluation results should always reflect what your pipeline will
|
||||||
> pipeline will produce at runtime.
|
> produce at runtime.
|
||||||
|
|
||||||
```ini
|
```ini
|
||||||
[nlp]
|
[nlp]
|
||||||
|
@ -463,6 +463,64 @@ replace_listeners = ["model.tok2vec"]
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
### Using predictions from preceding components {#annotating-components new="3.1"}
|
||||||
|
|
||||||
|
By default, components are updated in isolation during training, which means
|
||||||
|
that they don't see the predictions of any earlier components in the pipeline. A
|
||||||
|
component receives [`Example.predicted`](/api/example) as input and compares its
|
||||||
|
predictions to [`Example.reference`](/api/example) without saving its
|
||||||
|
annotations in the `predicted` doc.
|
||||||
|
|
||||||
|
Instead, if certain components should **set their annotations** during training,
|
||||||
|
use the setting `annotating_components` in the `[training]` block to specify a
|
||||||
|
list of components. For example, the feature `DEP` from the parser could be used
|
||||||
|
as a tagger feature by including `DEP` in the tok2vec `attrs` and including
|
||||||
|
`parser` in `annotating_components`:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### config.cfg (excerpt) {highlight="7,12"}
|
||||||
|
[nlp]
|
||||||
|
pipeline = ["parser", "tagger"]
|
||||||
|
|
||||||
|
[components.tagger.model.tok2vec.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
|
width = ${components.tagger.model.tok2vec.encode.width}
|
||||||
|
attrs = ["NORM","DEP"]
|
||||||
|
rows = [5000,2500]
|
||||||
|
include_static_vectors = false
|
||||||
|
|
||||||
|
[training]
|
||||||
|
annotating_components = ["parser"]
|
||||||
|
```
|
||||||
|
|
||||||
|
Any component in the pipeline can be included as an annotating component,
|
||||||
|
including frozen components. Frozen components can set annotations during
|
||||||
|
training just as they would set annotations during evaluation or when the final
|
||||||
|
pipeline is run. The config excerpt below shows how a frozen `ner` component and
|
||||||
|
a `sentencizer` can provide the required `doc.sents` and `doc.ents` for the
|
||||||
|
entity linker during training:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### config.cfg (excerpt)
|
||||||
|
[nlp]
|
||||||
|
pipeline = ["sentencizer", "ner", "entity_linker"]
|
||||||
|
|
||||||
|
[components.ner]
|
||||||
|
source = "en_core_web_sm"
|
||||||
|
|
||||||
|
[training]
|
||||||
|
frozen_components = ["ner"]
|
||||||
|
annotating_components = ["sentencizer", "ner"]
|
||||||
|
```
|
||||||
|
|
||||||
|
<Infobox variant="warning" title="Training speed with annotating components" id="annotating-components-speed">
|
||||||
|
|
||||||
|
Be aware that non-frozen annotating components with statistical models will
|
||||||
|
**run twice** on each batch, once to update the model and once to apply the
|
||||||
|
now-updated model to the predicted docs.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
### Using registered functions {#config-functions}
|
### Using registered functions {#config-functions}
|
||||||
|
|
||||||
The training configuration defined in the config file doesn't have to only
|
The training configuration defined in the config file doesn't have to only
|
||||||
|
|
|
@ -25,7 +25,13 @@
|
||||||
"code": "ca",
|
"code": "ca",
|
||||||
"name": "Catalan",
|
"name": "Catalan",
|
||||||
"example": "Això és una frase.",
|
"example": "Això és una frase.",
|
||||||
"has_examples": true
|
"has_examples": true,
|
||||||
|
"models": [
|
||||||
|
"ca_core_news_sm",
|
||||||
|
"ca_core_news_md",
|
||||||
|
"ca_core_news_lg",
|
||||||
|
"ca_core_news_trf"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"code": "cs",
|
"code": "cs",
|
||||||
|
@ -40,7 +46,8 @@
|
||||||
"models": [
|
"models": [
|
||||||
"da_core_news_sm",
|
"da_core_news_sm",
|
||||||
"da_core_news_md",
|
"da_core_news_md",
|
||||||
"da_core_news_lg"
|
"da_core_news_lg",
|
||||||
|
"da_core_news_trf"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -545,4 +552,4 @@
|
||||||
"url": "https://github.com/UniversalDependencies/UD_French-Sequoia/blob/master/LICENSE.txt"
|
"url": "https://github.com/UniversalDependencies/UD_French-Sequoia/blob/master/LICENSE.txt"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user