Merge branch 'develop' into nightly.spacy.io

This commit is contained in:
Ines Montani 2020-08-07 19:17:35 +02:00
commit bbc3e96690
145 changed files with 3897 additions and 3428 deletions

View File

@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a22,<8.0.0a30",
"thinc>=8.0.0a23,<8.0.0a30",
"blis>=0.4.0,<0.5.0",
"pytokenizations",
"smart_open>=2.0.0,<3.0.0"

View File

@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a22,<8.0.0a30
thinc>=8.0.0a23,<8.0.0a30
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0

View File

@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a22,<8.0.0a30
thinc>=8.0.0a23,<8.0.0a30
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a22,<8.0.0a30
thinc>=8.0.0a23,<8.0.0a30
blis>=0.4.0,<0.5.0
wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0

View File

@ -17,23 +17,28 @@ from .. import displacy
def evaluate_cli(
# fmt: off
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True),
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
# fmt: on
):
"""
Evaluate a model. To render a sample of parses in a HTML file, set an
output directory as the displacy_path argument.
Evaluate a model. Expects a loadable spaCy model and evaluation data in the
binary .spacy format. The --gold-preproc option sets up the evaluation
examples with gold-standard sentences and tokens for the predictions. Gold
preprocessing helps the annotations align to the tokenization, and may
result in sequences of more consistent length. However, it may reduce
runtime accuracy due to train/test skew. To render a sample of dependency
parses in a HTML file, set an output directory as the displacy_path argument.
"""
evaluate(
model,
data_path,
output=output,
gpu_id=gpu_id,
use_gpu=use_gpu,
gold_preproc=gold_preproc,
displacy_path=displacy_path,
displacy_limit=displacy_limit,
@ -45,7 +50,7 @@ def evaluate(
model: str,
data_path: Path,
output: Optional[Path] = None,
gpu_id: int = -1,
use_gpu: int = -1,
gold_preproc: bool = False,
displacy_path: Optional[Path] = None,
displacy_limit: int = 25,
@ -53,8 +58,8 @@ def evaluate(
) -> Scorer:
msg = Printer(no_print=silent, pretty=not silent)
fix_random_seed()
if gpu_id >= 0:
require_gpu(gpu_id)
if use_gpu >= 0:
require_gpu(use_gpu)
util.set_env_log(False)
data_path = util.ensure_path(data_path)
output_path = util.ensure_path(output)

View File

@ -19,9 +19,6 @@ after_pipeline_creation = null
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[components]
# Training hyper-parameters and additional features.

View File

@ -169,9 +169,9 @@ class Errors:
"training a named entity recognizer, also make sure that none of "
"your annotated entity spans have leading or trailing whitespace "
"or punctuation. "
"You can also use the experimental `debug-data` command to "
"You can also use the experimental `debug data` command to "
"validate your JSON-formatted training data. For details, run:\n"
"python -m spacy debug-data --help")
"python -m spacy debug data --help")
E025 = ("String is too long: {length} characters. Max is 2**30.")
E026 = ("Error accessing token at position {i}: out of bounds in Doc of "
"length {length}.")
@ -510,7 +510,7 @@ class Errors:
E952 = ("The section '{name}' is not a valid section in the provided config.")
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
E954 = ("The Tok2Vec listener did not receive a valid input.")
E955 = ("Can't find table '{table}' for language '{lang}' in spacy-lookups-data.")
E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
E956 = ("Can't find component '{name}' in [components] block in the config. "
"Available components: {opts}")
E957 = ("Writing directly to Language.factories isn't needed anymore in "
@ -633,6 +633,11 @@ class Errors:
E1001 = ("Target token outside of matched span for match with tokens "
"'{span}' and offset '{index}' matched by patterns '{patterns}'.")
E1002 = ("Span index out of range.")
E1003 = ("Unsupported lemmatizer mode '{mode}'.")
E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
"Required tables '{tables}', found '{found}'. If you are not "
"providing custom lookups, make sure you have the package "
"spacy-lookups-data installed.")
@add_codes

View File

@ -20,7 +20,7 @@ def create_docbin_reader(
class Corpus:
"""Iterate Example objects from a file or directory of DocBin (.spacy)
formated data files.
formatted data files.
path (Path): The directory or filename to read from.
gold_preproc (bool): Whether to set up the Example object with gold-standard
@ -39,7 +39,7 @@ class Corpus:
def __init__(
self,
path,
path: Union[str, Path],
*,
limit: int = 0,
gold_preproc: bool = False,
@ -136,8 +136,7 @@ class Corpus:
for loc in locs:
loc = util.ensure_path(loc)
if loc.parts[-1].endswith(".spacy"):
with loc.open("rb") as file_:
doc_bin = DocBin().from_bytes(file_.read())
doc_bin = DocBin().from_disk(loc)
docs = doc_bin.get_docs(vocab)
for doc in docs:
if len(doc):

View File

@ -1,38 +1,17 @@
from typing import Callable
from thinc.api import Config
from typing import Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import GreekLemmatizer
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ...lookups import load_lookups
from .lemmatizer import GreekLemmatizer
from ...lookups import Lookups
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
[nlp.lemmatizer]
@lemmatizers = "spacy.el.GreekLemmatizer"
"""
@registry.lemmatizers("spacy.el.GreekLemmatizer")
def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]:
tables = ["lemma_index", "lemma_exc", "lemma_rules"]
def lemmatizer_factory(nlp: Language) -> GreekLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return GreekLemmatizer(lookups=lookups)
return lemmatizer_factory
class GreekDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
@ -47,4 +26,22 @@ class Greek(Language):
Defaults = GreekDefaults
@Greek.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    lookups: Optional[Lookups],
):
    """Factory for the Greek "lemmatizer" pipeline component.

    nlp (Language): The pipeline object; supplies the language code and vocab.
    model (Optional[Model]): Passed through to the lemmatizer unchanged.
    name (str): The component name.
    mode (str): The lemmatizer mode ("rule" in the default config).
    lookups (Optional[Lookups]): Lookups handed to
        GreekLemmatizer.load_lookups (None in the default config).
    RETURNS (GreekLemmatizer): The configured lemmatizer component.
    """
    # Resolve the tables first, then hand the result to the component.
    tables = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
    component = GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=tables)
    return component
__all__ = ["Greek"]

View File

@ -1,6 +1,7 @@
from typing import Dict, List
from typing import List
from ...lemmatizer import Lemmatizer
from ...pipeline import Lemmatizer
from ...tokens import Token
class GreekLemmatizer(Lemmatizer):
@ -14,13 +15,27 @@ class GreekLemmatizer(Lemmatizer):
not applicable for Greek language.
"""
def lemmatize(
self,
string: str,
index: Dict[str, List[str]],
exceptions: Dict[str, Dict[str, List[str]]],
rules: Dict[str, List[List[str]]],
) -> List[str]:
def rule_lemmatize(self, token: Token) -> List[str]:
"""Lemmatize using a rule-based approach.
token (Token): The token to lemmatize.
RETURNS (list): The available lemmas for the string.
"""
cache_key = (token.lower, token.pos)
if cache_key in self.cache:
return self.cache[cache_key]
string = token.text
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
return [string.lower()]
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
index = index_table.get(univ_pos, {})
exceptions = exc_table.get(univ_pos, {})
rules = rules_table.get(univ_pos, {})
string = string.lower()
forms = []
if string in index:
@ -42,4 +57,6 @@ class GreekLemmatizer(Lemmatizer):
forms.extend(oov_forms)
if not forms:
forms.append(string)
return list(set(forms))
forms = list(set(forms))
self.cache[cache_key] = forms
return forms

View File

@ -1,39 +1,18 @@
from typing import Callable
from thinc.api import Config
from typing import Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .lemmatizer import is_base_form
from .punctuation import TOKENIZER_INFIXES
from .lemmatizer import EnglishLemmatizer
from ...language import Language
from ...lemmatizer import Lemmatizer
from ...lookups import load_lookups
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
[nlp.lemmatizer]
@lemmatizers = "spacy.en.EnglishLemmatizer"
"""
@registry.lemmatizers("spacy.en.EnglishLemmatizer")
def create_lemmatizer() -> Callable[[Language], Lemmatizer]:
tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
def lemmatizer_factory(nlp: Language) -> Lemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return Lemmatizer(lookups=lookups, is_base_form=is_base_form)
return lemmatizer_factory
from ...lookups import Lookups
class EnglishDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
@ -46,4 +25,22 @@ class English(Language):
Defaults = EnglishDefaults
@English.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    lookups: Optional[Lookups],
):
    """Factory for the English "lemmatizer" pipeline component.

    nlp (Language): The pipeline object; supplies the language code and vocab.
    model (Optional[Model]): Passed through to the lemmatizer unchanged.
    name (str): The component name.
    mode (str): The lemmatizer mode ("rule" in the default config).
    lookups (Optional[Lookups]): Lookups handed to
        EnglishLemmatizer.load_lookups (None in the default config).
    RETURNS (EnglishLemmatizer): The configured lemmatizer component.
    """
    # Resolve the tables first, then hand the result to the component.
    tables = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
    component = EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=tables)
    return component
__all__ = ["English"]

View File

@ -1,36 +1,43 @@
from typing import Optional
from ...pipeline import Lemmatizer
from ...tokens import Token
def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
univ_pos (unicode / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
class EnglishLemmatizer(Lemmatizer):
"""English lemmatizer. Only overrides is_base_form.
"""
if morphology is None:
morphology = {}
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == "verb" and (
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
elif morphology.get("VerbForm") == "inf":
return True
elif morphology.get("VerbForm") == "none":
return True
elif morphology.get("Degree") == "pos":
return True
else:
return False
def is_base_form(self, token: Token) -> bool:
    """Check whether we're dealing with an uninflected paradigm, so we can
    avoid lemmatization entirely.

    The decision uses the token's lowercased coarse-grained part-of-speech
    tag (token.pos_) and its morphological features (token.morph.to_dict()),
    compared against capitalized feature values such as "Sing", "Inf", "Fin",
    "Pres" and "Pos".

    token (Token): The token to check.
    RETURNS (bool): Whether the token is considered a base form.
    """
    univ_pos = token.pos_.lower()
    morphology = token.morph.to_dict()
    verb_form = morphology.get("VerbForm")
    number = morphology.get("Number")
    # Singular nouns are already their own lemma.
    if univ_pos == "noun" and number == "Sing":
        return True
    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
    # morphology
    if (
        univ_pos == "verb"
        and verb_form == "Fin"
        and morphology.get("Tense") == "Pres"
        and number is None
    ):
        return True
    # Infinitives and VerbForm=None count as base forms for any POS, which
    # also covers the verb-specific infinitive case.
    if verb_form in ("Inf", "None"):
        return True
    # Positive degree counts as a base form for any POS, which also covers
    # the adjective-specific case.
    return morphology.get("Degree") == "Pos"

View File

@ -1,5 +1,6 @@
from typing import Callable
from thinc.api import Config
from typing import Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@ -7,33 +8,12 @@ from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .lemmatizer import FrenchLemmatizer, is_base_form
from ...lookups import load_lookups
from .lemmatizer import FrenchLemmatizer
from ...lookups import Lookups
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
[nlp.lemmatizer]
@lemmatizers = "spacy.fr.FrenchLemmatizer"
"""
@registry.lemmatizers("spacy.fr.FrenchLemmatizer")
def create_lemmatizer() -> Callable[[Language], FrenchLemmatizer]:
tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
def lemmatizer_factory(nlp: Language) -> FrenchLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return FrenchLemmatizer(lookups=lookups, is_base_form=is_base_form)
return lemmatizer_factory
class FrenchDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
@ -49,4 +29,22 @@ class French(Language):
Defaults = FrenchDefaults
@French.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    lookups: Optional[Lookups],
):
    """Factory for the French "lemmatizer" pipeline component.

    nlp (Language): The pipeline object; supplies the language code and vocab.
    model (Optional[Model]): Passed through to the lemmatizer unchanged.
    name (str): The component name.
    mode (str): The lemmatizer mode ("rule" in the default config).
    lookups (Optional[Lookups]): Lookups handed to
        FrenchLemmatizer.load_lookups (None in the default config).
    RETURNS (FrenchLemmatizer): The configured lemmatizer component.
    """
    # Resolve the tables first, then hand the result to the component.
    tables = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
    component = FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=tables)
    return component
__all__ = ["French"]

View File

@ -1,8 +1,7 @@
from typing import Optional, List, Dict
from typing import List, Dict
from ...lemmatizer import Lemmatizer
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
from ...symbols import SCONJ, CCONJ
from ...pipeline import Lemmatizer
from ...tokens import Token
class FrenchLemmatizer(Lemmatizer):
@ -15,65 +14,55 @@ class FrenchLemmatizer(Lemmatizer):
the lookup table.
"""
def __call__(
self, string: str, univ_pos: str, morphology: Optional[dict] = None
) -> List[str]:
lookup_table = self.lookups.get_table("lemma_lookup", {})
if "lemma_rules" not in self.lookups:
return [lookup_table.get(string, string)]
if univ_pos in (NOUN, "NOUN", "noun"):
univ_pos = "noun"
elif univ_pos in (VERB, "VERB", "verb"):
univ_pos = "verb"
elif univ_pos in (ADJ, "ADJ", "adj"):
univ_pos = "adj"
elif univ_pos in (ADP, "ADP", "adp"):
univ_pos = "adp"
elif univ_pos in (ADV, "ADV", "adv"):
univ_pos = "adv"
elif univ_pos in (AUX, "AUX", "aux"):
univ_pos = "aux"
elif univ_pos in (CCONJ, "CCONJ", "cconj"):
univ_pos = "cconj"
elif univ_pos in (DET, "DET", "det"):
univ_pos = "det"
elif univ_pos in (PRON, "PRON", "pron"):
univ_pos = "pron"
elif univ_pos in (PUNCT, "PUNCT", "punct"):
univ_pos = "punct"
elif univ_pos in (SCONJ, "SCONJ", "sconj"):
univ_pos = "sconj"
@classmethod
def get_lookups_config(cls, mode: str) -> Dict:
if mode == "rule":
return {
"required_tables": [
"lemma_lookup",
"lemma_rules",
"lemma_exc",
"lemma_index",
],
"optional_tables": [],
}
else:
return [self.lookup(string)]
return super().get_lookups_config(mode)
def rule_lemmatize(self, token: Token) -> List[str]:
cache_key = (token.orth, token.pos)
if cache_key in self.cache:
return self.cache[cache_key]
string = token.text
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
return [string.lower()]
elif "lemma_rules" not in self.lookups or univ_pos not in (
"noun",
"verb",
"adj",
"adp",
"adv",
"aux",
"cconj",
"det",
"pron",
"punct",
"sconj",
):
return self.lookup_lemmatize(token)
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
lemmas = self.lemmatize(
string,
index_table.get(univ_pos, {}),
exc_table.get(univ_pos, {}),
rules_table.get(univ_pos, []),
)
return lemmas
def lookup(self, string: str, orth: Optional[int] = None) -> str:
lookup_table = self.lookups.get_table("lemma_lookup", {})
if orth is not None and orth in lookup_table:
return lookup_table[orth][0]
return string
def lemmatize(
self,
string: str,
index: Dict[str, List[str]],
exceptions: Dict[str, Dict[str, List[str]]],
rules: Dict[str, List[List[str]]],
) -> List[str]:
lookup_table = self.lookups.get_table("lemma_lookup", {})
index = index_table.get(univ_pos, {})
exceptions = exc_table.get(univ_pos, {})
rules = rules_table.get(univ_pos, [])
string = string.lower()
forms = []
if string in index:
forms.append(string)
self.cache[cache_key] = forms
return forms
forms.extend(exceptions.get(string, []))
oov_forms = []
@ -90,45 +79,9 @@ class FrenchLemmatizer(Lemmatizer):
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup_table.keys():
forms.append(lookup_table[string][0])
forms.append(self.lookup_lemmatize(token)[0])
if not forms:
forms.append(string)
return list(set(forms))
def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
"""
morphology = {} if morphology is None else morphology
others = [
key
for key in morphology
if key not in (POS, "Number", "POS", "VerbForm", "Tense")
]
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == "verb" and (
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
and not others
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
elif "VerbForm=inf" in morphology:
return True
elif "VerbForm=none" in morphology:
return True
elif "Number=sing" in morphology:
return True
elif "Degree=pos" in morphology:
return True
else:
return False
forms = list(set(forms))
self.cache[cache_key] = forms
return forms

View File

@ -38,8 +38,6 @@ def create_tokenizer(split_mode: Optional[str] = None):
class JapaneseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
self.vocab = nlp.vocab
# TODO: is this the right way to do it?
self.vocab.morphology.load_tag_map(TAG_MAP)
self.split_mode = split_mode
self.tokenizer = try_sudachi_import(self.split_mode)

View File

@ -7,6 +7,7 @@ from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...tokens import Doc
from ...compat import copy_reg
from ...symbols import POS
from ...util import DummyTokenizer, registry
@ -29,8 +30,6 @@ def create_tokenizer():
class KoreanTokenizer(DummyTokenizer):
def __init__(self, nlp: Optional[Language] = None):
self.vocab = nlp.vocab
# TODO: is this the right way to do it?
self.vocab.morphology.load_tag_map(TAG_MAP)
MeCab = try_mecab_import()
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
@ -44,6 +43,7 @@ class KoreanTokenizer(DummyTokenizer):
for token, dtoken in zip(doc, dtokens):
first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미)
token.pos = TAG_MAP[token.tag_][POS]
token.lemma_ = dtoken["lemma"]
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
return doc

View File

@ -1,5 +1,6 @@
from typing import Callable
from thinc.api import Config
from typing import Optional
from thinc.api import Model
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
@ -7,32 +8,11 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer
from ...lookups import load_lookups
from ...lookups import Lookups
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
[nlp.lemmatizer]
@lemmatizers = "spacy.nl.DutchLemmatizer"
"""
@registry.lemmatizers("spacy.nl.DutchLemmatizer")
def create_lemmatizer() -> Callable[[Language], DutchLemmatizer]:
tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
def lemmatizer_factory(nlp: Language) -> DutchLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return DutchLemmatizer(lookups=lookups)
return lemmatizer_factory
class DutchDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
@ -46,4 +26,22 @@ class Dutch(Language):
Defaults = DutchDefaults
@Dutch.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    lookups: Optional[Lookups],
):
    """Factory for the Dutch "lemmatizer" pipeline component.

    nlp (Language): The pipeline object; supplies the language code and vocab.
    model (Optional[Model]): Passed through to the lemmatizer unchanged.
    name (str): The component name.
    mode (str): The lemmatizer mode ("rule" in the default config).
    lookups (Optional[Lookups]): Lookups handed to
        DutchLemmatizer.load_lookups (None in the default config).
    RETURNS (DutchLemmatizer): The configured lemmatizer component.
    """
    # Resolve the tables first, then hand the result to the component.
    tables = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
    component = DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=tables)
    return component
__all__ = ["Dutch"]

View File

@ -1,44 +1,34 @@
from typing import Optional, List, Dict, Tuple
from typing import List, Dict
from ...lemmatizer import Lemmatizer
from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
from ...pipeline import Lemmatizer
from ...tokens import Token
class DutchLemmatizer(Lemmatizer):
# Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
univ_pos_name_variants = {
NOUN: "noun",
"NOUN": "noun",
"noun": "noun",
VERB: "verb",
"VERB": "verb",
"verb": "verb",
AUX: "verb",
"AUX": "verb",
"aux": "verb",
ADJ: "adj",
"ADJ": "adj",
"adj": "adj",
ADV: "adv",
"ADV": "adv",
"adv": "adv",
PRON: "pron",
"PRON": "pron",
"pron": "pron",
DET: "det",
"DET": "det",
"det": "det",
ADP: "adp",
"ADP": "adp",
"adp": "adp",
NUM: "num",
"NUM": "num",
"num": "num",
}
@classmethod
def get_lookups_config(cls, mode: str) -> Dict:
    """Return the lookup tables configuration for a lemmatizer mode.

    In "rule" mode the lookup, rules, exceptions and index tables are all
    listed as required. Any other mode is delegated to the parent class.

    mode (str): The lemmatizer mode.
    RETURNS (Dict): The lookups configuration.
    """
    # Guard clause: only "rule" mode is customized here.
    if mode != "rule":
        return super().get_lookups_config(mode)
    required_tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
    return {"required_tables": required_tables}
def __call__(
self, string: str, univ_pos: str, morphology: Optional[dict] = None
) -> List[str]:
def lookup_lemmatize(self, token: Token) -> List[str]:
    """Lemmatize a token via the "lemma_lookup" table.

    The token text is lowercased before the lookup because the lookup table
    consists entirely of lowercase keys. If the lowercased text is not in
    the table, the lowercased text itself is returned.

    token (Token): The token to lemmatize.
    RETURNS (List[str]): A single-element list containing the lemma.
    """
    lowered = token.text.lower()
    table = self.lookups.get_table("lemma_lookup", {})
    return [table.get(lowered, lowered)]
# Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
def rule_lemmatize(self, token: Token) -> List[str]:
# Difference 1: self.rules is assumed to be non-None, so no
# 'is None' check required.
# String lowercased from the get-go. All lemmatization results in
@ -46,74 +36,61 @@ class DutchLemmatizer(Lemmatizer):
# any problems, and it keeps the exceptions indexes small. If this
# creates problems for proper nouns, we can introduce a check for
# univ_pos == "PROPN".
string = string.lower()
try:
univ_pos = self.univ_pos_name_variants[univ_pos]
except KeyError:
# Because PROPN not in self.univ_pos_name_variants, proper names
# are not lemmatized. They are lowercased, however.
return [string]
# if string in self.lemma_index.get(univ_pos)
cache_key = (token.lower, token.pos)
if cache_key in self.cache:
return self.cache[cache_key]
string = token.text
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
forms = [string.lower()]
self.cache[cache_key] = forms
return forms
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
index = index_table.get(univ_pos, {})
exceptions = exc_table.get(univ_pos, {})
rules = rules_table.get(univ_pos, {})
string = string.lower()
if univ_pos not in (
"noun",
"verb",
"aux",
"adj",
"adv",
"pron",
"det",
"adp",
"num",
):
forms = [string]
self.cache[cache_key] = forms
return forms
lemma_index = index_table.get(univ_pos, {})
# string is already lemma
if string in lemma_index:
return [string]
forms = [string]
self.cache[cache_key] = forms
return forms
exc_table = self.lookups.get_table("lemma_exc", {})
exceptions = exc_table.get(univ_pos, {})
# string is irregular token contained in exceptions index.
try:
lemma = exceptions[string]
return [lemma[0]]
forms = [exceptions[string][0]]
self.cache[cache_key] = forms
return forms
except KeyError:
pass
# string corresponds to key in lookup table
lookup_table = self.lookups.get_table("lemma_lookup", {})
looked_up_lemma = lookup_table.get(string)
if looked_up_lemma and looked_up_lemma in lemma_index:
return [looked_up_lemma]
forms = [looked_up_lemma]
self.cache[cache_key] = forms
return forms
rules_table = self.lookups.get_table("lemma_rules", {})
forms, is_known = self.lemmatize(
string, lemma_index, exceptions, rules_table.get(univ_pos, [])
)
# Back-off through remaining return value candidates.
if forms:
if is_known:
return forms
else:
for form in forms:
if form in exceptions:
return [form]
if looked_up_lemma:
return [looked_up_lemma]
else:
return forms
elif looked_up_lemma:
return [looked_up_lemma]
else:
return [string]
# Overrides parent method so that a lowercased version of the string is
# used to search the lookup table. This is necessary because our lookup
# table consists entirely of lowercase keys.
def lookup(self, string: str, orth: Optional[int] = None) -> str:
lookup_table = self.lookups.get_table("lemma_lookup", {})
string = string.lower()
if orth is not None:
return lookup_table.get(orth, string)
else:
return lookup_table.get(string, string)
# Reimplemented to focus more on application of suffix rules and to return
# as early as possible.
def lemmatize(
self,
string: str,
index: Dict[str, List[str]],
exceptions: Dict[str, Dict[str, List[str]]],
rules: Dict[str, List[List[str]]],
) -> Tuple[List[str], bool]:
# returns (forms, is_known: bool)
oov_forms = []
for old, new in rules:
if string.endswith(old):
@ -121,7 +98,31 @@ class DutchLemmatizer(Lemmatizer):
if not form:
pass
elif form in index:
return [form], True # True = Is known (is lemma)
forms = [form]
self.cache[cache_key] = forms
return forms
else:
oov_forms.append(form)
return list(set(oov_forms)), False
forms = list(set(oov_forms))
# Back-off through remaining return value candidates.
if forms:
for form in forms:
if form in exceptions:
forms = [form]
self.cache[cache_key] = forms
return forms
if looked_up_lemma:
forms = [looked_up_lemma]
self.cache[cache_key] = forms
return forms
else:
self.cache[cache_key] = forms
return forms
elif looked_up_lemma:
forms = [looked_up_lemma]
self.cache[cache_key] = forms
return forms
else:
forms = [string]
self.cache[cache_key] = forms
return forms

View File

@ -1,5 +1,6 @@
from typing import Callable
from thinc.api import Config
from typing import Optional
from thinc.api import Model
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
@ -7,42 +8,16 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import PolishLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...lookups import load_lookups
from ...lookups import Lookups
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
[nlp.lemmatizer]
@lemmatizers = "spacy.pl.PolishLemmatizer"
"""
TOKENIZER_EXCEPTIONS = {
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
}
@registry.lemmatizers("spacy.pl.PolishLemmatizer")
def create_lemmatizer() -> Callable[[Language], PolishLemmatizer]:
# fmt: off
tables = [
"lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv",
"lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num",
"lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"
]
# fmt: on
def lemmatizer_factory(nlp: Language) -> PolishLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return PolishLemmatizer(lookups=lookups)
return lemmatizer_factory
class PolishDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
@ -56,4 +31,22 @@ class Polish(Language):
Defaults = PolishDefaults
@Polish.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "lookup", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    lookups: Optional[Lookups],
):
    """Factory for the Polish "lemmatizer" pipeline component.

    nlp (Language): The pipeline object; supplies the language code and vocab.
    model (Optional[Model]): Passed through to the lemmatizer unchanged.
    name (str): The component name.
    mode (str): The lemmatizer mode ("lookup" in the default config).
    lookups (Optional[Lookups]): Lookups handed to
        PolishLemmatizer.load_lookups (None in the default config).
    RETURNS (PolishLemmatizer): The configured lemmatizer component.
    """
    # Resolve the tables first, then hand the result to the component.
    tables = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
    component = PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=tables)
    return component
__all__ = ["Polish"]

View File

@ -1,7 +1,7 @@
from typing import Optional, List, Dict
from typing import List, Dict
from ...lemmatizer import Lemmatizer
from ...parts_of_speech import NAMES
from ...pipeline import Lemmatizer
from ...tokens import Token
class PolishLemmatizer(Lemmatizer):
@ -9,12 +9,30 @@ class PolishLemmatizer(Lemmatizer):
# dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS.
# It utilizes some prefix based improvements for verb and adjectives
# lemmatization, as well as case-sensitive lemmatization for nouns.
def __call__(
self, string: str, univ_pos: str, morphology: Optional[dict] = None
) -> List[str]:
if isinstance(univ_pos, int):
univ_pos = NAMES.get(univ_pos, "X")
univ_pos = univ_pos.upper()
@classmethod
def get_lookups_config(cls, mode: str) -> Dict:
    """Return the lookup tables configuration for a lemmatizer mode.

    In "lookup" mode one lemma_lookup_<pos> table per listed part-of-speech
    is required. Any other mode is delegated to the parent class.

    mode (str): The lemmatizer mode.
    RETURNS (Dict): The lookups configuration.
    """
    # Guard clause: only "lookup" mode is customized here.
    if mode != "lookup":
        return super().get_lookups_config(mode)
    # fmt: off
    required_tables = [
        "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv",
        "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num",
        "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb",
    ]
    # fmt: on
    return {"required_tables": required_tables}
def lookup_lemmatize(self, token: Token) -> List[str]:
string = token.text
univ_pos = token.pos_
morphology = token.morph.to_dict()
lookup_pos = univ_pos.lower()
if univ_pos == "PROPN":
lookup_pos = "noun"
@ -71,15 +89,3 @@ class PolishLemmatizer(Lemmatizer):
return [lookup_table[string]]
return [string.lower()]
return [lookup_table.get(string, string)]
def lookup(self, string: str, orth: Optional[int] = None) -> str:
return string.lower()
def lemmatize(
self,
string: str,
index: Dict[str, List[str]],
exceptions: Dict[str, Dict[str, List[str]]],
rules: Dict[str, List[List[str]]],
) -> List[str]:
raise NotImplementedError

View File

@ -1,32 +1,16 @@
from typing import Callable
from thinc.api import Config
from typing import Optional
from thinc.api import Model
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer
from ...util import registry
from ...language import Language
DEFAULT_CONFIG = """
[nlp]
[nlp.lemmatizer]
@lemmatizers = "spacy.ru.RussianLemmatizer"
"""
@registry.lemmatizers("spacy.ru.RussianLemmatizer")
def create_lemmatizer() -> Callable[[Language], RussianLemmatizer]:
    """Registered under "spacy.ru.RussianLemmatizer": build a lemmatizer factory.

    RETURNS (Callable[[Language], RussianLemmatizer]): A function that takes
        the nlp object and returns a new RussianLemmatizer.
    """

    def lemmatizer_factory(nlp: Language) -> RussianLemmatizer:
        # The nlp object is accepted to satisfy the factory signature but is
        # not used: the lemmatizer is constructed without arguments.
        return RussianLemmatizer()

    return lemmatizer_factory
from ...lookups import Lookups
class RussianDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
@ -37,4 +21,21 @@ class Russian(Language):
Defaults = RussianDefaults
# Registered as the "lemmatizer" factory on the Russian language class; the
# default config selects mode "pymorphy2" with no model and no lookups.
@Russian.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    lookups: Optional[Lookups],
):
    """Create a RussianLemmatizer pipeline component.

    nlp (Language): The nlp object; only its vocab is passed on.
    model (Optional[Model]): Forwarded unchanged to the lemmatizer.
    name (str): The component name.
    mode (str): The lemmatizer mode.
    lookups (Optional[Lookups]): Forwarded unchanged to the lemmatizer.
    RETURNS (RussianLemmatizer): The newly constructed component.
    """
    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
__all__ = ["Russian"]

View File

@ -1,8 +1,12 @@
from typing import Optional, Tuple, Dict, List
from typing import Optional, List, Dict, Tuple
from thinc.api import Model
from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
from ...lemmatizer import Lemmatizer
from ...lookups import Lookups
from ...pipeline import Lemmatizer
from ...symbols import POS
from ...tokens import Token
from ...vocab import Vocab
PUNCT_RULES = {"«": '"', "»": '"'}
@ -11,8 +15,17 @@ PUNCT_RULES = {"«": '"', "»": '"'}
class RussianLemmatizer(Lemmatizer):
_morph = None
def __init__(self, lookups: Optional[Lookups] = None) -> None:
super(RussianLemmatizer, self).__init__(lookups)
def __init__(
self,
vocab: Vocab,
model: Optional[Model],
name: str = "lemmatizer",
*,
mode: str = "pymorphy2",
lookups: Optional[Lookups] = None,
) -> None:
super().__init__(vocab, model, name, mode=mode, lookups=lookups)
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
@ -25,10 +38,10 @@ class RussianLemmatizer(Lemmatizer):
if RussianLemmatizer._morph is None:
RussianLemmatizer._morph = MorphAnalyzer()
def __call__(
self, string: str, univ_pos: str, morphology: Optional[dict] = None
) -> List[str]:
univ_pos = self.normalize_univ_pos(univ_pos)
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
string = token.text
univ_pos = token.pos_
morphology = token.morph.to_dict()
if univ_pos == "PUNCT":
return [PUNCT_RULES.get(string, string)]
if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
@ -81,25 +94,8 @@ class RussianLemmatizer(Lemmatizer):
return [string.lower()]
return list(set([analysis.normal_form for analysis in filtered_analyses]))
@staticmethod
def normalize_univ_pos(univ_pos: str) -> Optional[str]:
    """Convert a part-of-speech value to its upper-case string name.

    univ_pos (str): Either a string tag, or one of the symbol constants
        ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT or VERB.
    RETURNS (Optional[str]): The upper-cased tag for string input, the
        matching name for a known symbol, or None for anything else.
    """
    if isinstance(univ_pos, str):
        return univ_pos.upper()
    # Non-string input: translate the handful of supported symbols by name.
    name_by_symbol = {
        ADJ: "ADJ",
        DET: "DET",
        NOUN: "NOUN",
        NUM: "NUM",
        PRON: "PRON",
        PROPN: "PROPN",
        PUNCT: "PUNCT",
        VERB: "VERB",
    }
    return name_by_symbol.get(univ_pos)
def lookup(self, string: str, orth: Optional[int] = None) -> str:
def lookup_lemmatize(self, token: Token) -> List[str]:
string = token.text
analyses = self._morph.parse(string)
if len(analyses) == 1:
return analyses[0].normal_form

View File

@ -1,32 +1,16 @@
from typing import Callable
from thinc.api import Config
from typing import Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...util import registry
from ...language import Language
from .lemmatizer import UkrainianLemmatizer
DEFAULT_CONFIG = """
[nlp]
[nlp.lemmatizer]
@lemmatizers = "spacy.uk.UkrainianLemmatizer"
"""
@registry.lemmatizers("spacy.uk.UkrainianLemmatizer")
def create_ukrainian_lemmatizer() -> Callable[[Language], UkrainianLemmatizer]:
    """Registered under "spacy.uk.UkrainianLemmatizer": build a lemmatizer factory.

    RETURNS (Callable[[Language], UkrainianLemmatizer]): A function that takes
        the nlp object and returns a new UkrainianLemmatizer.
    """

    def lemmatizer_factory(nlp: Language) -> UkrainianLemmatizer:
        # The nlp object is accepted to satisfy the factory signature but is
        # not used: the lemmatizer is constructed without arguments.
        return UkrainianLemmatizer()

    return lemmatizer_factory
from ...language import Language
from ...lookups import Lookups
class UkrainianDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
@ -37,4 +21,21 @@ class Ukrainian(Language):
Defaults = UkrainianDefaults
# Registered as the "lemmatizer" factory on the Ukrainian language class; the
# default config selects mode "pymorphy2" with no model and no lookups.
@Ukrainian.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    lookups: Optional[Lookups],
):
    """Create a UkrainianLemmatizer pipeline component.

    nlp (Language): The nlp object; only its vocab is passed on.
    model (Optional[Model]): Forwarded unchanged to the lemmatizer.
    name (str): The component name.
    mode (str): The lemmatizer mode.
    lookups (Optional[Lookups]): Forwarded unchanged to the lemmatizer.
    RETURNS (UkrainianLemmatizer): The newly constructed component.
    """
    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
__all__ = ["Ukrainian"]

View File

@ -1,187 +1,30 @@
from typing import Optional, List, Tuple, Dict
from typing import Optional
from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
from thinc.api import Model
from ..ru.lemmatizer import RussianLemmatizer
from ...lookups import Lookups
from ...lemmatizer import Lemmatizer
from ...vocab import Vocab
PUNCT_RULES = {"«": '"', "»": '"'}
class UkrainianLemmatizer(Lemmatizer):
_morph = None
def __init__(self, lookups: Optional[Lookups] = None) -> None:
super(UkrainianLemmatizer, self).__init__(lookups)
class UkrainianLemmatizer(RussianLemmatizer):
def __init__(
self,
vocab: Vocab,
model: Optional[Model],
name: str = "lemmatizer",
*,
mode: str = "pymorphy2",
lookups: Optional[Lookups] = None,
) -> None:
super().__init__(vocab, model, name, mode=mode, lookups=lookups)
try:
from pymorphy2 import MorphAnalyzer
if UkrainianLemmatizer._morph is None:
UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
except (ImportError, TypeError):
except ImportError:
raise ImportError(
"The Ukrainian lemmatizer requires the pymorphy2 library and "
'dictionaries: try to fix it with "pip uninstall pymorphy2" and'
'"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
) from None
def __call__(
    self, string: str, univ_pos: str, morphology: Optional[dict] = None
) -> List[str]:
    """Lemmatize a string using the analyses returned by `self._morph`.

    string (str): The string to lemmatize.
    univ_pos (str): The part-of-speech tag; normalized with
        `normalize_univ_pos` before use.
    morphology (dict): Optional morphological features used to narrow down
        the candidate analyses.
    RETURNS (List[str]): The de-duplicated candidate lemmas (in no particular
        order), or the lowercased string if no analysis qualifies.
    """
    univ_pos = self.normalize_univ_pos(univ_pos)
    if univ_pos == "PUNCT":
        return [PUNCT_RULES.get(string, string)]
    inflecting_pos = ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB")
    if univ_pos not in inflecting_pos:
        # Unchangeable parts of speech are only lowercased.
        return [string.lower()]

    nominal_pos = ("NOUN", "PROPN")

    def pos_agrees(candidate_pos):
        # NOUN and PROPN are treated as interchangeable.
        return candidate_pos == univ_pos or (
            candidate_pos in nominal_pos and univ_pos in nominal_pos
        )

    # Keep known analyses only: unknown words just get suggested variants.
    candidates = [
        parse
        for parse in self._morph.parse(string)
        if parse.is_known and pos_agrees(oc2ud(str(parse.tag))[0])
    ]
    if not candidates:
        return [string.lower()]
    if morphology is None or (len(morphology) == 1 and POS in morphology):
        return list(set([parse.normal_form for parse in candidates]))

    # Which features must agree depends on the part of speech; VERB is the
    # only tag left once the explicit entries are ruled out.
    features_by_pos = {
        "ADJ": ["Case", "Number", "Gender"],
        "DET": ["Case", "Number", "Gender"],
        "NOUN": ["Case", "Number", "Gender"],
        "PROPN": ["Case", "Number", "Gender"],
        "NUM": ["Case", "Gender"],
        "PRON": ["Case", "Number", "Gender", "Person"],
    }
    verb_features = [
        "Aspect",
        "Gender",
        "Mood",
        "Number",
        "Tense",
        "VerbForm",
        "Voice",
    ]
    features_to_compare = features_by_pos.get(univ_pos, verb_features)

    def contradicts(analysis_morph):
        # A feature only disqualifies an analysis if both sides specify it
        # and the values differ (case-insensitively).
        return any(
            feature in morphology
            and feature in analysis_morph
            and morphology[feature].lower() != analysis_morph[feature].lower()
            for feature in features_to_compare
        )

    survivors = [
        parse for parse in candidates if not contradicts(oc2ud(str(parse.tag))[1])
    ]
    if not survivors:
        return [string.lower()]
    return list(set([parse.normal_form for parse in survivors]))
@staticmethod
def normalize_univ_pos(univ_pos: str) -> Optional[str]:
    """Convert a part-of-speech value to its upper-case string name.

    univ_pos (str): Either a string tag, or one of the symbol constants
        ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT or VERB.
    RETURNS (Optional[str]): The upper-cased tag for string input, the
        matching name for a known symbol, or None for anything else.
    """
    if isinstance(univ_pos, str):
        return univ_pos.upper()
    # Non-string input: translate the handful of supported symbols by name.
    name_by_symbol = {
        ADJ: "ADJ",
        DET: "DET",
        NOUN: "NOUN",
        NUM: "NUM",
        PRON: "PRON",
        PROPN: "PROPN",
        PUNCT: "PUNCT",
        VERB: "VERB",
    }
    return name_by_symbol.get(univ_pos)
def lookup(self, string: str, orth: Optional[int] = None) -> str:
    """Return the normal form of a string when its analysis is unambiguous.

    string (str): The string to look up.
    orth (int): Accepted but not used by this implementation.
    RETURNS (str): The normal form if `self._morph.parse` yields exactly one
        analysis, otherwise the original string.
    """
    parses = self._morph.parse(string)
    return parses[0].normal_form if len(parses) == 1 else string
# Grammeme -> Universal Dependencies value, grouped by UD category. The
# special "_POS" category maps to the part-of-speech tag instead of a
# morphological feature. Built once at import time rather than on every call.
_OC2UD_GRAM_MAP = {
    "_POS": {
        "ADJF": "ADJ",
        "ADJS": "ADJ",
        "ADVB": "ADV",
        "Apro": "DET",
        "COMP": "ADJ",  # Can also be an ADV - unchangeable
        "CONJ": "CCONJ",  # Can also be a SCONJ - both unchangeable ones
        "GRND": "VERB",
        "INFN": "VERB",
        "INTJ": "INTJ",
        "NOUN": "NOUN",
        "NPRO": "PRON",
        "NUMR": "NUM",
        "NUMB": "NUM",
        "PNCT": "PUNCT",
        "PRCL": "PART",
        "PREP": "ADP",
        "PRTF": "VERB",
        "PRTS": "VERB",
        "VERB": "VERB",
    },
    "Animacy": {"anim": "Anim", "inan": "Inan"},
    "Aspect": {"impf": "Imp", "perf": "Perf"},
    "Case": {
        "ablt": "Ins",
        "accs": "Acc",
        "datv": "Dat",
        "gen1": "Gen",
        "gen2": "Gen",
        "gent": "Gen",
        "loc2": "Loc",
        "loct": "Loc",
        "nomn": "Nom",
        "voct": "Voc",
    },
    "Degree": {"COMP": "Cmp", "Supr": "Sup"},
    "Gender": {"femn": "Fem", "masc": "Masc", "neut": "Neut"},
    "Mood": {"impr": "Imp", "indc": "Ind"},
    "Number": {"plur": "Plur", "sing": "Sing"},
    "NumForm": {"NUMB": "Digit"},
    "Person": {"1per": "1", "2per": "2", "3per": "3", "excl": "2", "incl": "1"},
    "Tense": {"futr": "Fut", "past": "Past", "pres": "Pres"},
    "Variant": {"ADJS": "Brev", "PRTS": "Brev"},
    "VerbForm": {
        "GRND": "Conv",
        "INFN": "Inf",
        "PRTF": "Part",
        "PRTS": "Part",
        "VERB": "Fin",
    },
    "Voice": {"actv": "Act", "pssv": "Pass"},
    "Abbr": {"Abbr": "Yes"},
}


def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
    """Convert a grammeme tag string into a UD part of speech and features.

    The tag is split on commas and spaces into grammemes. Each grammeme found
    in the mapping sets the POS and/or one morphological feature; when two
    grammemes set the same thing, the later one wins. Grammemes that are not
    in the mapping are then handled in their order of appearance: proper-name
    markers force the POS to PROPN, "Auxt" forces AUX and "Pltm" sets
    Number=Ptan. Anything else is ignored.

    NOTE(review): the input is presumably an OpenCorpora tag as produced by
    pymorphy2 (the "oc" in the name) - confirm against the caller.

    oc_tag (str): The tag, e.g. "NOUN,anim,masc sing,nomn".
    RETURNS (Tuple[str, Dict[str, str]]): The POS ("X" if none was found) and
        a dict of morphological features.
    """
    pos = "X"
    morphology = {}
    # A list rather than a set, so the fallback rules below run in tag order
    # and the result never depends on string hash randomization.
    unmatched = []
    for gram in oc_tag.replace(" ", ",").split(","):
        matched = False
        # A grammeme may appear in several categories (e.g. "ADJS" is both a
        # POS and a Variant), so every category is checked.
        for categ, gmap in _OC2UD_GRAM_MAP.items():
            if gram in gmap:
                matched = True
                if categ == "_POS":
                    pos = gmap[gram]
                else:
                    morphology[categ] = gmap[gram]
        if not matched:
            unmatched.append(gram)
    for gram in unmatched:
        if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
            pos = "PROPN"
        elif gram == "Auxt":
            pos = "AUX"
        elif gram == "Pltm":
            morphology["Number"] = "Ptan"
    return pos, morphology
if UkrainianLemmatizer._morph is None:
UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")

View File

@ -29,7 +29,6 @@ from .lang.punctuation import TOKENIZER_INFIXES
from .tokens import Doc
from .lookups import load_lookups
from .tokenizer import Tokenizer
from .lemmatizer import Lemmatizer
from .errors import Errors, Warnings
from .schemas import ConfigSchema
from .git_info import GIT_VERSION
@ -87,22 +86,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
return tokenizer_factory
@registry.lemmatizers("spacy.Lemmatizer.v1")
def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]:
    """Registered function to create a lemmatizer. Returns a factory that takes
    the nlp object and returns a Lemmatizer instance with data loaded in from
    spacy-lookups-data, if the package is installed.

    RETURNS (Callable[[Language], Lemmatizer]): The lemmatizer factory.
    """
    # TODO: Will be replaced when the lemmatizer becomes a pipeline component
    # The lookup tables requested for the language of the nlp object.
    tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]

    def lemmatizer_factory(nlp: "Language") -> "Lemmatizer":
        # NOTE(review): strict=False presumably makes missing tables
        # non-fatal, so the lemmatizer may get empty lookups - confirm
        # against load_lookups.
        lookups = load_lookups(lang=nlp.lang, tables=tables, strict=False)
        return Lemmatizer(lookups=lookups)

    return lemmatizer_factory
class Language:
"""A text-processing pipeline. Usually you'll load this once per process,
and pass the instance around your application.
@ -128,7 +111,6 @@ class Language:
max_length: int = 10 ** 6,
meta: Dict[str, Any] = {},
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
create_lemmatizer: Optional[Callable[["Language"], Callable]] = None,
**kwargs,
) -> None:
"""Initialise a Language object.
@ -146,8 +128,6 @@ class Language:
100,000 characters in one text.
create_tokenizer (Callable): Function that takes the nlp object and
returns a tokenizer.
create_lemmatizer (Callable): Function that takes the nlp object and
returns a lemmatizer.
DOCS: https://spacy.io/api/language#init
"""
@ -166,13 +146,9 @@ class Language:
if vocab is True:
vectors_name = meta.get("vectors", {}).get("name")
if not create_lemmatizer:
lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
vocab = create_vocab(
self.lang,
self.Defaults,
lemmatizer=create_lemmatizer(self),
vectors_name=vectors_name,
load_data=self._config["nlp"]["load_vocab_data"],
)
@ -1451,7 +1427,6 @@ class Language:
filled["components"] = orig_pipeline
config["components"] = orig_pipeline
create_tokenizer = resolved["nlp"]["tokenizer"]
create_lemmatizer = resolved["nlp"]["lemmatizer"]
before_creation = resolved["nlp"]["before_creation"]
after_creation = resolved["nlp"]["after_creation"]
after_pipeline_creation = resolved["nlp"]["after_pipeline_creation"]
@ -1467,7 +1442,6 @@ class Language:
nlp = lang_cls(
vocab=vocab,
create_tokenizer=create_tokenizer,
create_lemmatizer=create_lemmatizer,
)
if after_creation is not None:
nlp = after_creation(nlp)

View File

@ -1,145 +0,0 @@
from typing import Optional, Callable, List, Dict
from .lookups import Lookups
from .parts_of_speech import NAMES as UPOS_NAMES
class Lemmatizer:
    """
    Lemmatizer backed by lookup tables and part-of-speech-sensitive suffix
    rules.

    DOCS: https://spacy.io/api/lemmatizer
    """

    def __init__(
        self,
        lookups: Optional[Lookups] = None,
        is_base_form: Optional[Callable] = None,
    ) -> None:
        """Initialize a Lemmatizer.

        lookups (Lookups): The lookups object containing the (optional) tables
            "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". An
            empty Lookups object is created if none is given.
        is_base_form (Callable): Optional function called with the lowercased
            part-of-speech and the morphology; a true result means the string
            is already a base form.
        """
        self.lookups = lookups if lookups is not None else Lookups()
        self.is_base_form = is_base_form

    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        """Lemmatize a string.

        string (str): The string to lemmatize, e.g. the token text.
        univ_pos (str / int): The token's universal part-of-speech tag.
        morphology (dict): The token's morphological features following the
            Universal Dependencies scheme.
        RETURNS (list): The available lemmas for the string.
        """
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        # Without suffix rules, the plain lookup table is all there is.
        if "lemma_rules" not in self.lookups:
            return [lookup_table.get(string, string)]
        if isinstance(univ_pos, int):
            univ_pos = UPOS_NAMES.get(univ_pos, "X")
        univ_pos = univ_pos.lower()
        if univ_pos in ("", "eol", "space"):
            return [string.lower()]
        # See Issue #435 for an example of where this logic is required.
        if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology):
            return [string.lower()]
        index_table = self.lookups.get_table("lemma_index", {})
        exc_table = self.lookups.get_table("lemma_exc", {})
        rules_table = self.lookups.get_table("lemma_rules", {})
        pos_index = index_table.get(univ_pos)
        pos_exceptions = exc_table.get(univ_pos)
        pos_rules = rules_table.get(univ_pos)
        if not (pos_index or pos_exceptions or pos_rules):
            # No data for this part of speech: proper nouns keep their
            # casing, everything else is lowercased.
            return [string] if univ_pos == "propn" else [string.lower()]
        return self.lemmatize(
            string,
            index_table.get(univ_pos, {}),
            exc_table.get(univ_pos, {}),
            rules_table.get(univ_pos, []),
        )

    def noun(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        """Lemmatize the string as a noun."""
        return self(string, "noun", morphology)

    def verb(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        """Lemmatize the string as a verb."""
        return self(string, "verb", morphology)

    def adj(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        """Lemmatize the string as an adjective."""
        return self(string, "adj", morphology)

    def det(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        """Lemmatize the string as a determiner."""
        return self(string, "det", morphology)

    def pron(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        """Lemmatize the string as a pronoun."""
        return self(string, "pron", morphology)

    def adp(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        """Lemmatize the string as an adposition."""
        return self(string, "adp", morphology)

    def num(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        """Lemmatize the string as a numeral."""
        return self(string, "num", morphology)

    def punct(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        """Lemmatize the string as punctuation."""
        return self(string, "punct", morphology)

    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        """Look up a lemma in the "lemma_lookup" table, if available.

        string (str): The original string.
        orth (int): Optional key to look up instead of the string.
        RETURNS (str): The lemma if the key was found, otherwise the
            original string.
        """
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        key = string if orth is None else orth
        return lookup_table[key] if key in lookup_table else string

    def lemmatize(
        self,
        string: str,
        index: Dict[str, List[str]],
        exceptions: Dict[str, Dict[str, List[str]]],
        rules: Dict[str, List[List[str]]],
    ) -> List[str]:
        """Apply suffix rules and exceptions to a string.

        string (str): The string to lemmatize; matched in lowercase.
        index: The known forms for the part of speech.
        exceptions: Exception lemmas keyed by lowercased string.
        rules: The (old suffix, new suffix) pairs to try.
        RETURNS (List[str]): Exceptions and rule results that are in the
            index or non-alphabetic; failing that, the remaining rule
            results; failing that, the original string.
        """
        original = string
        string = string.lower()
        known_forms = []
        unknown_forms = []
        for old_suffix, new_suffix in rules:
            if not string.endswith(old_suffix):
                continue
            candidate = string[: len(string) - len(old_suffix)] + new_suffix
            if not candidate:
                continue
            if candidate in index or not candidate.isalpha():
                known_forms.append(candidate)
            else:
                unknown_forms.append(candidate)
        # Remove duplicates but preserve the ordering of applied "rules"
        known_forms = list(dict.fromkeys(known_forms))
        # Exceptions go to the front of the list so they get priority. This
        # is a heuristic in the absence of frequency information.
        for exception in exceptions.get(string, []):
            if exception not in known_forms:
                known_forms.insert(0, exception)
        return known_forms or unknown_forms or [original]

View File

@ -28,6 +28,8 @@ def load_lookups(
# TODO: import spacy_lookups_data instead of going via entry points here?
lookups = Lookups()
if lang not in registry.lookups:
if strict and len(tables) > 0:
raise ValueError(Errors.E955.format(table=", ".join(tables), lang=lang))
return lookups
data = registry.lookups.get(lang)
for table in tables:
@ -41,152 +43,6 @@ def load_lookups(
return lookups
class Lookups:
    """Container for large lookup tables and dictionaries, e.g. lemmatization
    data or tokenizer exception lists. Lookups are available via vocab.lookups,
    so they can be accessed before the pipeline components are applied (e.g.
    in the tokenizer and lemmatizer), as well as within the pipeline components
    via doc.vocab.lookups.
    """

    def __init__(self) -> None:
        """Create a Lookups object with no tables.

        DOCS: https://spacy.io/api/lookups#init
        """
        self._tables = {}

    def __contains__(self, name: str) -> bool:
        """Support `name in lookups`. Delegates to Lookups.has_table.

        name (str): Name of the table.
        RETURNS (bool): Whether a table of that name is in the lookups.
        """
        return self.has_table(name)

    def __len__(self) -> int:
        """RETURNS (int): The number of tables in the lookups."""
        return len(self._tables)

    @property
    def tables(self) -> List[str]:
        """RETURNS (List[str]): Names of all tables in the lookups."""
        return list(self._tables)

    def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table":
        """Create a table from the data and register it under a new name.

        name (str): Unique name of table.
        data (dict): Optional data to add to the table.
        RETURNS (Table): The newly added table.
        RAISES (ValueError): If a table of that name already exists.

        DOCS: https://spacy.io/api/lookups#add_table
        """
        if name in self.tables:
            raise ValueError(Errors.E158.format(name=name))
        new_table = Table(name=name, data=data)
        self._tables[name] = new_table
        return new_table

    def get_table(self, name: str, default: Any = UNSET) -> "Table":
        """Fetch a table by name, falling back to a default if one is given.

        name (str): Name of the table.
        default (Any): Optional default value to return if table doesn't exist.
        RETURNS (Table): The table.
        RAISES (KeyError): If the table doesn't exist and no default was given.

        DOCS: https://spacy.io/api/lookups#get_table
        """
        if name in self._tables:
            return self._tables[name]
        if default == UNSET:
            raise KeyError(Errors.E159.format(name=name, tables=self.tables))
        return default

    def remove_table(self, name: str) -> "Table":
        """Remove a table and hand it back to the caller.

        name (str): Name of the table to remove.
        RETURNS (Table): The removed table.
        RAISES (KeyError): If the table doesn't exist.

        DOCS: https://spacy.io/api/lookups#remove_table
        """
        if name in self._tables:
            return self._tables.pop(name)
        raise KeyError(Errors.E159.format(name=name, tables=self.tables))

    def has_table(self, name: str) -> bool:
        """Check if the lookups contain a table of a given name.

        name (str): Name of the table.
        RETURNS (bool): Whether a table of that name exists.

        DOCS: https://spacy.io/api/lookups#has_table
        """
        return name in self._tables

    def to_bytes(self, **kwargs) -> bytes:
        """Serialize all tables to a msgpack bytestring.

        RETURNS (bytes): The serialized Lookups.

        DOCS: https://spacy.io/api/lookups#to_bytes
        """
        return srsly.msgpack_dumps(self._tables)

    def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
        """Replace the current tables with those stored in a bytestring.

        bytes_data (bytes): The data to load.
        RETURNS (Lookups): The loaded Lookups.

        DOCS: https://spacy.io/api/lookups#from_bytes
        """
        # Existing tables are dropped before the new ones are loaded.
        self._tables = {}
        for table_name, table_data in srsly.msgpack_loads(bytes_data).items():
            self._tables[table_name] = Table(table_name, table_data)
        return self

    def to_disk(
        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
    ) -> None:
        """Save the lookups to a directory as lookups.bin. The directory is
        created if it doesn't exist. Nothing is written if there are no tables.

        path (str / Path): The directory path.
        filename (str): Name of the file to write inside the directory.

        DOCS: https://spacy.io/api/lookups#to_disk
        """
        if not self._tables:
            return
        directory = ensure_path(path)
        if not directory.exists():
            directory.mkdir()
        with (directory / filename).open("wb") as file_:
            file_.write(self.to_bytes())

    def from_disk(
        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
    ) -> "Lookups":
        """Load lookups from a directory containing a lookups.bin. Loading is
        skipped if the file doesn't exist.

        path (str / Path): The directory path.
        filename (str): Name of the file to read inside the directory.
        RETURNS (Lookups): The loaded lookups.

        DOCS: https://spacy.io/api/lookups#from_disk
        """
        filepath = ensure_path(path) / filename
        if not filepath.exists():
            return self
        with filepath.open("rb") as file_:
            serialized = file_.read()
        return self.from_bytes(serialized)
class Table(OrderedDict):
"""A table in the lookups. Subclass of builtin dict that implements a
slightly more consistent and unified API.
@ -303,3 +159,159 @@ class Table(OrderedDict):
self.clear()
self.update(data)
return self
class Lookups:
    """Container for large lookup tables and dictionaries, e.g. lemmatization
    data or tokenizer exception lists. Lookups are available via vocab.lookups,
    so they can be accessed before the pipeline components are applied (e.g.
    in the tokenizer and lemmatizer), as well as within the pipeline components
    via doc.vocab.lookups.
    """

    def __init__(self) -> None:
        """Create a Lookups object with no tables.

        DOCS: https://spacy.io/api/lookups#init
        """
        self._tables = {}

    def __contains__(self, name: str) -> bool:
        """Support `name in lookups`. Delegates to Lookups.has_table.

        name (str): Name of the table.
        RETURNS (bool): Whether a table of that name is in the lookups.
        """
        return self.has_table(name)

    def __len__(self) -> int:
        """RETURNS (int): The number of tables in the lookups."""
        return len(self._tables)

    @property
    def tables(self) -> List[str]:
        """RETURNS (List[str]): Names of all tables in the lookups."""
        return list(self._tables)

    def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> Table:
        """Create a table from the data and register it under a new name.

        name (str): Unique name of table.
        data (dict): Optional data to add to the table.
        RETURNS (Table): The newly added table.
        RAISES (ValueError): If a table of that name already exists.

        DOCS: https://spacy.io/api/lookups#add_table
        """
        if name in self.tables:
            raise ValueError(Errors.E158.format(name=name))
        new_table = Table(name=name, data=data)
        self._tables[name] = new_table
        return new_table

    def set_table(self, name: str, table: Table) -> None:
        """Store a table under the given name, replacing any existing one.

        name (str): Name of the table to set.
        table (Table): The Table to set.

        DOCS: https://spacy.io/api/lookups#set_table
        """
        self._tables[name] = table

    def get_table(self, name: str, default: Any = UNSET) -> Table:
        """Fetch a table by name, falling back to a default if one is given.

        name (str): Name of the table.
        default (Any): Optional default value to return if table doesn't exist.
        RETURNS (Table): The table.
        RAISES (KeyError): If the table doesn't exist and no default was given.

        DOCS: https://spacy.io/api/lookups#get_table
        """
        if name in self._tables:
            return self._tables[name]
        if default == UNSET:
            raise KeyError(Errors.E159.format(name=name, tables=self.tables))
        return default

    def remove_table(self, name: str) -> Table:
        """Remove a table and hand it back to the caller.

        name (str): Name of the table to remove.
        RETURNS (Table): The removed table.
        RAISES (KeyError): If the table doesn't exist.

        DOCS: https://spacy.io/api/lookups#remove_table
        """
        if name in self._tables:
            return self._tables.pop(name)
        raise KeyError(Errors.E159.format(name=name, tables=self.tables))

    def has_table(self, name: str) -> bool:
        """Check if the lookups contain a table of a given name.

        name (str): Name of the table.
        RETURNS (bool): Whether a table of that name exists.

        DOCS: https://spacy.io/api/lookups#has_table
        """
        return name in self._tables

    def to_bytes(self, **kwargs) -> bytes:
        """Serialize all tables to a msgpack bytestring.

        RETURNS (bytes): The serialized Lookups.

        DOCS: https://spacy.io/api/lookups#to_bytes
        """
        return srsly.msgpack_dumps(self._tables)

    def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
        """Replace the current tables with those stored in a bytestring.

        bytes_data (bytes): The data to load.
        RETURNS (Lookups): The loaded Lookups.

        DOCS: https://spacy.io/api/lookups#from_bytes
        """
        # Existing tables are dropped before the new ones are loaded.
        self._tables = {}
        for table_name, table_data in srsly.msgpack_loads(bytes_data).items():
            self._tables[table_name] = Table(table_name, table_data)
        return self

    def to_disk(
        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
    ) -> None:
        """Save the lookups to a directory as lookups.bin. The directory is
        created if it doesn't exist. Nothing is written if there are no tables.

        path (str / Path): The directory path.
        filename (str): Name of the file to write inside the directory.

        DOCS: https://spacy.io/api/lookups#to_disk
        """
        if not self._tables:
            return
        directory = ensure_path(path)
        if not directory.exists():
            directory.mkdir()
        with (directory / filename).open("wb") as file_:
            file_.write(self.to_bytes())

    def from_disk(
        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
    ) -> "Lookups":
        """Load lookups from a directory containing a lookups.bin. Loading is
        skipped if the file doesn't exist.

        path (str / Path): The directory path.
        filename (str): Name of the file to read inside the directory.
        RETURNS (Lookups): The loaded lookups.

        DOCS: https://spacy.io/api/lookups#from_disk
        """
        filepath = ensure_path(path) / filename
        if not filepath.exists():
            return self
        with filepath.open("rb") as file_:
            serialized = file_.read()
        return self.from_bytes(serialized)

View File

@ -1,20 +1,73 @@
from typing import Optional
from typing import Optional, List
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
from thinc.types import Floats2d
from ...util import registry
from .._precomputable_affine import PrecomputableAffine
from ..tb_framework import TransitionModel
from ...tokens import Doc
@registry.architectures.register("spacy.TransitionBasedParser.v1")
def build_tb_parser_model(
tok2vec: Model,
tok2vec: Model[List[Doc], List[Floats2d]],
nr_feature_tokens: int,
hidden_width: int,
maxout_pieces: int,
use_upper: bool = True,
nO: Optional[int] = None,
) -> Model:
"""
Build a transition-based parser model. Can apply to NER or dependency-parsing.
Transition-based parsing is an approach to structured prediction where the
task of predicting the structure is mapped to a series of state transitions.
You might find this tutorial helpful as background:
https://explosion.ai/blog/parsing-english-in-python
The neural network state prediction model consists of either two or three
subnetworks:
* tok2vec: Map each token into a vector representation. This subnetwork
is run once for each batch.
* lower: Construct a feature-specific vector for each (token, feature) pair.
This is also run once for each batch. Constructing the state
representation is then simply a matter of summing the component features
and applying the non-linearity.
* upper (optional): A feed-forward network that predicts scores from the
state representation. If not present, the output from the lower model is
used as action scores directly.
tok2vec (Model[List[Doc], List[Floats2d]]):
Subnetwork to map tokens into vector representations.
nr_feature_tokens (int): The number of tokens in the context to use to
construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The
2, 8 and 13 feature sets are designed for the parser, while the 3 and 6
feature sets are designed for the NER. The recommended feature sets are
3 for NER, and 8 for the dependency parser.
TODO: This feature should be split into two, state_type: ["deps", "ner"]
and extra_state_features: [True, False]. This would map into:
(deps, False): 8
(deps, True): 13
(ner, False): 3
(ner, True): 6
hidden_width (int): The width of the hidden layer.
maxout_pieces (int): How many pieces to use in the state prediction layer.
Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
is replaced with a ReLu non-linearity if use_upper=True, and no
non-linearity if use_upper=False.
use_upper (bool): Whether to use an additional hidden layer after the state
vector in order to predict the action scores. It is recommended to set
this to False for large pretrained models such as transformers, and True
for smaller networks. The upper layer is computed on CPU, which becomes
a bottleneck on larger GPU-based models, where it's also less necessary.
nO (int or None): The number of actions the model will predict between.
Usually inferred from data at the beginning of training, or loaded from
disk.
"""
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
tok2vec.set_dim("nO", hidden_width)

View File

@ -10,10 +10,24 @@ from .._iob import IOB
from ...util import registry
@registry.architectures.register("spacy.BiluoTagger.v1")
@registry.architectures.register("spacy.BILUOTagger.v1")
def BiluoTagger(
tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]:
"""Construct a simple NER tagger, that predicts BILUO tag scores for each
token and uses greedy decoding with transition-constraints to return a valid
BILUO tag sequence.
A BILUO tag sequence encodes a sequence of non-overlapping labelled spans
into tags assigned to each token. The first token of a span is given the
tag B-LABEL, the last token of the span is given the tag L-LABEL, and tokens
within the span are given the tag I-LABEL. Single-token spans are given
the tag U-LABEL. All other tokens are assigned the tag O.
The BILUO tag scheme generally results in better linear separation between
classes, especially for non-CRF models, because there are more distinct classes
for the different situations (Ratinov et al., 2009).
"""
biluo = BILUO()
linear = Linear(
nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02)
@ -41,6 +55,15 @@ def BiluoTagger(
def IOBTagger(
tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]:
"""Construct a simple NER tagger, that predicts IOB tag scores for each
token and uses greedy decoding with transition-constraints to return a valid
IOB tag sequence.
An IOB tag sequence encodes a sequence of non-overlapping labelled spans
into tags assigned to each token. The first token of a span is given the
tag B-LABEL, and subsequent tokens are given the tag I-LABEL.
All other tokens are assigned the tag O.
"""
biluo = IOB()
linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
model = chain(

View File

@ -1,11 +1,22 @@
from typing import Optional
from typing import Optional, List
from thinc.api import zero_init, with_array, Softmax, chain, Model
from thinc.types import Floats2d
from ...util import registry
from ...tokens import Doc
@registry.architectures.register("spacy.Tagger.v1")
def build_tagger_model(tok2vec: Model, nO: Optional[int] = None) -> Model:
def build_tagger_model(
tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None
) -> Model[List[Doc], List[Floats2d]]:
"""Build a tagger model, using a provided token-to-vector component. The tagger
model simply adds a linear layer with softmax activation to predict scores
given the token vectors.
tok2vec (Model[List[Doc], List[Floats2d]]): The token-to-vector subnetwork.
nO (int or None): The number of tags to output. Inferred from the data if None.
"""
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
output_layer = Softmax(nO, t2v_width, init_W=zero_init)

View File

@ -45,6 +45,7 @@ def build_bow_text_classifier(
no_output_layer: bool,
nO: Optional[int] = None,
) -> Model:
# Don't document this yet, I'm not sure it's right.
with Model.define_operators({">>": chain}):
sparse_linear = SparseLinear(nO)
model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
@ -69,6 +70,7 @@ def build_text_classifier(
dropout: Optional[float],
nO: Optional[int] = None,
) -> Model:
# Don't document this yet, I'm not sure it's right.
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
lower = HashEmbed(
@ -160,6 +162,7 @@ def build_text_classifier_lowdata(
dropout: Optional[float],
nO: Optional[int] = None,
) -> Model:
# Don't document this yet, I'm not sure it's right.
# Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
with Model.define_operators({">>": chain, "**": clone}):
model = (

View File

@ -28,11 +28,31 @@ def build_hash_embed_cnn_tok2vec(
window_size: int,
maxout_pieces: int,
subword_features: bool,
dropout: Optional[float],
pretrained_vectors: Optional[bool]
) -> Model[List[Doc], List[Floats2d]]:
"""Build spaCy's 'standard' tok2vec layer, which uses hash embedding
with subword features and a CNN with layer-normalized maxout."""
with subword features and a CNN with layer-normalized maxout.
width (int): The width of the input and output. These are required to be the
same, so that residual connections can be used. Recommended values are
96, 128 or 300.
depth (int): The number of convolutional layers to use. Recommended values
are between 2 and 8.
window_size (int): The number of tokens on either side to concatenate during
the convolutions. The receptive field of the CNN will be
depth * window_size * 2 + 1, so a 4-layer network with window_size of
2 will be sensitive to 17 words at a time. Recommended value is 1.
embed_size (int): The number of rows in the hash embedding tables. This can
be surprisingly small, due to the use of the hash embeddings. Recommended
values are between 2000 and 10000.
maxout_pieces (int): The number of pieces to use in the maxout non-linearity.
If 1, the Mish non-linearity is used instead. Recommended values are 1-3.
subword_features (bool): Whether to also embed subword features, specifically
the prefix, suffix and word shape. This is recommended for alphabetic
languages like English, but not if single-character tokens are used for
a language such as Chinese.
pretrained_vectors (bool): Whether to also use static vectors.
"""
return build_Tok2Vec_model(
embed=MultiHashEmbed(
width=width,
@ -54,7 +74,14 @@ def build_Tok2Vec_model(
embed: Model[List[Doc], List[Floats2d]],
encode: Model[List[Floats2d], List[Floats2d]],
) -> Model[List[Doc], List[Floats2d]]:
"""Construct a tok2vec model out of embedding and encoding subnetworks.
See https://explosion.ai/blog/deep-learning-formula-nlp
embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-independent
word vector representations.
encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the
embeddings, using an architecture such as a CNN, BiLSTM or transformer.
"""
receptive_field = encode.attrs.get("receptive_field", 0)
tok2vec = chain(embed, with_array(encode, pad=receptive_field))
tok2vec.set_dim("nO", encode.get_dim("nO"))
@ -67,6 +94,27 @@ def build_Tok2Vec_model(
def MultiHashEmbed(
width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
):
"""Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it
through a feed-forward subnetwork to build a mixed representation.
The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have
varying definitions depending on the Vocab of the Doc object passed in.
Vectors from pretrained static vectors can also be incorporated into the
concatenated representation.
width (int): The output width. Also used as the width of the embedding tables.
Recommended values are between 64 and 300.
rows (int): The number of rows for the embedding tables. Can be low, due
to the hashing trick. Embeddings for prefix, suffix and word shape
use half as many rows. Recommended values are between 2000 and 10000.
also_embed_subwords (bool): Whether to use the PREFIX, SUFFIX and SHAPE
features in the embeddings. If not using these, you may need more
rows in your hash embeddings, as there will be increased chance of
collisions.
also_use_static_vectors (bool): Whether to also use static word vectors.
Requires a vectors table to be loaded in the Doc objects' vocab.
"""
cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
seed = 7
@ -117,6 +165,30 @@ def MultiHashEmbed(
@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
"""Construct an embedded representations based on character embeddings, using
a feed-forward network. A fixed number of UTF-8 byte characters are used for
each word, taken from the beginning and end of the word equally. Padding is
used in the centre for words that are too short.
For instance, let's say nC=4, and the word is "jumping". The characters
used will be "jung" (two from the start, two from the end). If we had nC=8,
the characters would be "jumpping": 4 from the start, 4 from the end. This
ensures that the final character is always in the last position, instead
of being in an arbitrary position depending on the word length.
The characters are embedded in an embedding table with 256 rows, and the
vectors concatenated. A hash-embedded vector of the NORM of the word is
also concatenated on, and the result is then passed through a feed-forward
network to construct a single vector to represent the information.
width (int): The width of the output vector and the NORM hash embedding.
rows (int): The number of rows in the NORM hash embedding table.
nM (int): The dimensionality of the character embeddings. Recommended values
are between 16 and 64.
nC (int): The number of UTF-8 bytes to embed per word. Recommended values
are between 3 and 8, although it may depend on the length of words in the
language.
"""
model = chain(
concatenate(
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
@ -133,7 +205,21 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: int):
def MaxoutWindowEncoder(
width: int, window_size: int, maxout_pieces: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
"""Encode context using convolutions with maxout activation, layer
normalization and residual connections.
width (int): The input and output width. These are required to be the same,
to allow residual connections. This value will be determined by the
width of the inputs. Recommended values are between 64 and 300.
window_size (int): The number of words to concatenate around each token
to construct the convolution. Recommended value is 1.
maxout_pieces (int): The number of maxout pieces to use. Recommended
values are 2 or 3.
depth (int): The number of convolutional layers. Recommended value is 4.
"""
cnn = chain(
expand_window(window_size=window_size),
Maxout(
@ -151,7 +237,19 @@ def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth:
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def MishWindowEncoder(width, window_size, depth):
def MishWindowEncoder(
width: int, window_size: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
"""Encode context using convolutions with mish activation, layer
normalization and residual connections.
width (int): The input and output width. These are required to be the same,
to allow residual connections. This value will be determined by the
width of the inputs. Recommended values are between 64 and 300.
window_size (int): The number of words to concatenate around each token
to construct the convolution. Recommended value is 1.
depth (int): The number of convolutional layers. Recommended value is 4.
"""
cnn = chain(
expand_window(window_size=window_size),
Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
@ -162,7 +260,18 @@ def MishWindowEncoder(width, window_size, depth):
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
def BiLSTMEncoder(width, depth, dropout):
def BiLSTMEncoder(
    width: int, depth: int, dropout: float
) -> Model[List[Floats2d], List[Floats2d]]:
    """Encode context using bidirectional LSTM layers. Requires PyTorch.

    width (int): The input and output width. The same value is passed to the
        LSTM for both, so the encoder does not change the vector width.
    depth (int): The number of recurrent (LSTM) layers. If 0, no LSTM is
        created and a no-op layer is returned instead.
    dropout (float): The dropout setting forwarded to the PyTorch LSTM.
    RETURNS (Model[List[Floats2d], List[Floats2d]]): The encoder layer.
    """
    if depth == 0:
        # Nothing to encode with: pass the embeddings through unchanged.
        return noop()
    return with_padded(PyTorchLSTM(width, width, bi=True, depth=depth, dropout=dropout))

View File

@ -27,12 +27,6 @@ cdef class Morphology:
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
cdef int insert(self, MorphAnalysisC tag) except -1
cdef int assign_untagged(self, TokenC* token) except -1
cdef int assign_tag(self, TokenC* token, tag) except -1
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
cdef list list_features(const MorphAnalysisC* morph)

View File

@ -31,43 +31,15 @@ cdef class Morphology:
VALUE_SEP = ","
EMPTY_MORPH = "_" # not an empty string so that the PreshMap key is not 0
def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None):
def __init__(self, StringStore strings):
self.mem = Pool()
self.strings = strings
self.tags = PreshMap()
self.load_tag_map(tag_map)
self.lemmatizer = lemmatizer
self._cache = PreshMapArray(self.n_tags)
self._exc = {}
if exc is not None:
self.load_morph_exceptions(exc)
def load_tag_map(self, tag_map):
self.tag_map = {}
self.reverse_index = {}
# Add special space symbol. We prefix with underscore, to make sure it
# always sorts to the end.
if '_SP' in tag_map:
space_attrs = tag_map.get('_SP')
else:
space_attrs = tag_map.get('SP', {POS: SPACE})
if '_SP' not in tag_map:
self.strings.add('_SP')
tag_map = dict(tag_map)
tag_map['_SP'] = space_attrs
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
attrs = self.normalize_attrs(attrs)
self.add(attrs)
self.tag_map[tag_str] = dict(attrs)
self.reverse_index[self.strings.add(tag_str)] = i
self.tag_names = tuple(sorted(self.tag_map.keys()))
self.n_tags = len(self.tag_map)
self._cache = PreshMapArray(self.n_tags)
def __reduce__(self):
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
self.exc), None, None)
tags = set([self.get(self.strings[s]) for s in self.strings])
tags -= set([""])
return (unpickle_morphology, (self.strings, sorted(tags)), None, None)
def add(self, features):
"""Insert a morphological analysis in the morphology table, if not
@ -185,115 +157,6 @@ cdef class Morphology:
else:
return self.strings[tag.key]
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
if orth not in self.strings:
return orth
cdef unicode py_string = self.strings[orth]
if self.lemmatizer is None:
return self.strings.add(py_string.lower())
cdef list lemma_strings
cdef unicode lemma_string
# Normalize features into a dict keyed by the field, to make life easier
# for the lemmatizer. Handles string-to-int conversion too.
string_feats = {}
for key, value in morphology.items():
if value is True:
name, value = self.strings.as_string(key).split('_', 1)
string_feats[name] = value
else:
string_feats[self.strings.as_string(key)] = self.strings.as_string(value)
lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats)
lemma_string = lemma_strings[0]
lemma = self.strings.add(lemma_string)
return lemma
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
force=False):
"""Add a special-case rule to the morphological analyser. Tokens whose
tag and orth match the rule will receive the specified properties.
tag (str): The part-of-speech tag to key the exception.
orth (str): The word-form to key the exception.
"""
attrs = dict(attrs)
attrs = self.normalize_attrs(attrs)
self.add(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self._exc[(tag_str, self.strings.add(orth_str))] = attrs
cdef int assign_untagged(self, TokenC* token) except -1:
"""Set morphological attributes on a token without a POS tag. Uses
the lemmatizer's lookup() method, which looks up the string in the
table provided by the language data as lemma_lookup (if available).
"""
if token.lemma == 0:
orth_str = self.strings[token.lex.orth]
lemma = self.lemmatizer.lookup(orth_str, orth=token.lex.orth)
token.lemma = self.strings.add(lemma)
cdef int assign_tag(self, TokenC* token, tag_str) except -1:
cdef attr_t tag = self.strings.as_int(tag_str)
if tag in self.reverse_index:
tag_id = self.reverse_index[tag]
self.assign_tag_id(token, tag_id)
else:
token.tag = tag
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id > self.n_tags:
raise ValueError(Errors.E014.format(tag=tag_id))
# Ensure spaces get tagged as space.
# It seems pretty arbitrary to put this logic here, but there's really
# nowhere better. I guess the justification is that this is where the
# specific word and the tag interact. Still, we should have a better
# way to enforce this rule, or figure out why the statistical model fails.
# Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings.add('_SP')]
tag_str = self.tag_names[tag_id]
features = dict(self.tag_map.get(tag_str, {}))
if features:
pos = self.strings.as_int(features.pop(POS))
else:
pos = 0
cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
if lemma == 0:
# Ugh, self.lemmatize has opposite arg order from self.lemmatizer :(
lemma = self.lemmatize(pos, token.lex.orth, features)
self._cache.set(tag_id, token.lex.orth, <void*>lemma)
token.lemma = lemma
token.pos = <univ_pos_t>pos
token.tag = self.strings[tag_str]
token.morph = self.add(features)
if (self.tag_names[tag_id], token.lex.orth) in self._exc:
self._assign_tag_from_exceptions(token, tag_id)
cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
key = (self.tag_names[tag_id], token.lex.orth)
cdef dict attrs
attrs = self._exc[key]
token.pos = attrs.get(POS, token.pos)
token.lemma = attrs.get(LEMMA, token.lemma)
def load_morph_exceptions(self, dict morph_rules):
self._exc = {}
# Map (form, pos) to attributes
for tag, exc in morph_rules.items():
for orth, attrs in exc.items():
attrs = self.normalize_attrs(attrs)
self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs)
@property
def exc(self):
# generate the serializable exc in the MORPH_RULES format from the
# internal tuple-key format
morph_rules = {}
for (tag, orth) in sorted(self._exc):
if not tag in morph_rules:
morph_rules[tag] = {}
morph_rules[tag][self.strings[orth]] = self._exc[(tag, orth)]
return morph_rules
@staticmethod
def feats_to_dict(feats):
if not feats or feats == Morphology.EMPTY_MORPH:
@ -338,3 +201,9 @@ cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t fie
results[n_results] = morph.features[i]
n_results += 1
return n_results
def unpickle_morphology(strings, tags):
    """Rebuild a Morphology from its pickled state.

    strings (StringStore): The string store to create the Morphology with.
    tags (Iterable): The morphological analyses to re-insert, one
        Morphology.add call per entry.
    RETURNS (Morphology): The reconstructed Morphology.
    """
    cdef Morphology restored = Morphology(strings)
    for analysis in tags:
        restored.add(analysis)
    return restored

View File

@ -3,9 +3,10 @@ from .dep_parser import DependencyParser
from .entity_linker import EntityLinker
from .ner import EntityRecognizer
from .entityruler import EntityRuler
from .lemmatizer import Lemmatizer
from .morphologizer import Morphologizer
from .pipe import Pipe
from spacy.pipeline.senter import SentenceRecognizer
from .senter import SentenceRecognizer
from .sentencizer import Sentencizer
from .simple_ner import SimpleNER
from .tagger import Tagger
@ -20,6 +21,7 @@ __all__ = [
"EntityRecognizer",
"EntityRuler",
"Morphologizer",
"Lemmatizer",
"Pipe",
"SentenceRecognizer",
"Sentencizer",

View File

@ -17,13 +17,18 @@ MatcherPatternType = List[Dict[Union[int, str], Any]]
AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]
@Language.factory("attribute_ruler")
@Language.factory(
"attribute_ruler", default_config={"pattern_dicts": None, "validate": False}
)
def make_attribute_ruler(
nlp: Language,
name: str,
pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None,
pattern_dicts: Optional[Iterable[AttributeRulerPatternType]],
validate: bool,
):
return AttributeRuler(nlp.vocab, name, pattern_dicts=pattern_dicts)
return AttributeRuler(
nlp.vocab, name, pattern_dicts=pattern_dicts, validate=validate
)
class AttributeRuler(Pipe):
@ -39,6 +44,7 @@ class AttributeRuler(Pipe):
name: str = "attribute_ruler",
*,
pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None,
validate: bool = False,
) -> None:
"""Initialize the AttributeRuler.
@ -54,7 +60,7 @@ class AttributeRuler(Pipe):
"""
self.name = name
self.vocab = vocab
self.matcher = Matcher(self.vocab)
self.matcher = Matcher(self.vocab, validate=validate)
self.attrs = []
self._attrs_unnormed = [] # store for reference
self.indices = []
@ -63,7 +69,7 @@ class AttributeRuler(Pipe):
self.add_patterns(pattern_dicts)
def __call__(self, doc: Doc) -> Doc:
"""Apply the attributeruler to a Doc and set all attribute exceptions.
"""Apply the AttributeRuler to a Doc and set all attribute exceptions.
doc (Doc): The document to process.
RETURNS (Doc): The processed Doc.
@ -89,9 +95,31 @@ class AttributeRuler(Pipe):
set_token_attrs(token, attrs)
return doc
def pipe(self, stream, *, batch_size=128):
    """Apply the pipe to a stream of documents. This usually happens under
    the hood when the nlp object is called on a text and all components are
    applied to the Doc.

    stream (Iterable[Doc]): A stream of documents.
    batch_size (int): The number of documents to buffer. Not used by this
        implementation, which processes the stream one document at a time.
    YIELDS (Doc): Processed documents in order.

    DOCS: https://spacy.io/api/attributeruler#pipe
    """
    for doc in stream:
        yield self(doc)
def load_from_tag_map(
self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]]
) -> None:
"""Load attribute ruler patterns from a tag map.
tag_map (dict): The tag map that maps fine-grained tags to
coarse-grained tags and morphological features.
DOCS: https://spacy.io/api/attributeruler#load_from_tag_map
"""
for tag, attrs in tag_map.items():
pattern = [{"TAG": tag}]
attrs, morph_attrs = _split_morph_attrs(attrs)
@ -102,6 +130,14 @@ class AttributeRuler(Pipe):
def load_from_morph_rules(
self, morph_rules: Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
) -> None:
"""Load attribute ruler patterns from morph rules.
morph_rules (dict): The morph rules that map token text and
fine-grained tags to coarse-grained tags, lemmas and morphological
features.
DOCS: https://spacy.io/api/attributeruler#load_from_morph_rules
"""
for tag in morph_rules:
for word in morph_rules[tag]:
pattern = [{"ORTH": word, "TAG": tag}]
@ -133,11 +169,20 @@ class AttributeRuler(Pipe):
self.indices.append(index)
def add_patterns(self, pattern_dicts: Iterable[AttributeRulerPatternType]) -> None:
"""Add patterns from a list of pattern dicts with the keys as the
arguments to AttributeRuler.add.
pattern_dicts (Iterable[dict]): A list of pattern dicts with the keys
as the arguments to AttributeRuler.add (patterns/attrs/index) to
add as patterns.
DOCS: https://spacy.io/api/attributeruler#add_patterns
"""
for p in pattern_dicts:
self.add(**p)
@property
def patterns(self) -> List[AttributeRulerPatternType]:
"""All the added patterns."""
all_patterns = []
for i in range(len(self.attrs)):
p = {}
@ -148,7 +193,7 @@ class AttributeRuler(Pipe):
return all_patterns
def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes:
"""Serialize the attributeruler to a bytestring.
"""Serialize the AttributeRuler to a bytestring.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.
@ -164,7 +209,7 @@ class AttributeRuler(Pipe):
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()):
"""Load the attributeruler from a bytestring.
"""Load the AttributeRuler from a bytestring.
bytes_data (bytes): The data to load.
exclude (Iterable[str]): String names of serialization fields to exclude.
@ -200,7 +245,7 @@ class AttributeRuler(Pipe):
return self
def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()) -> None:
"""Serialize the attributeruler to disk.
"""Serialize the AttributeRuler to disk.
path (Union[Path, str]): A path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
@ -218,7 +263,7 @@ class AttributeRuler(Pipe):
def from_disk(
self, path: Union[Path, str], exclude: Iterable[str] = tuple()
) -> None:
"""Load the attributeruler from disk.
"""Load the AttributeRuler from disk.
path (Union[Path, str]): A path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.

View File

@ -27,7 +27,6 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null
"""
DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -29,7 +29,6 @@ embed_size = 300
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null
"""
DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -20,7 +20,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
default_config={
"phrase_matcher_attr": None,
"validation": False,
"validate": False,
"overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP,
},
@ -31,7 +31,7 @@ def make_entity_ruler(
nlp: Language,
name: str,
phrase_matcher_attr: Optional[Union[int, str]],
validation: bool,
validate: bool,
overwrite_ents: bool,
ent_id_sep: str,
):
@ -39,7 +39,7 @@ def make_entity_ruler(
nlp,
name,
phrase_matcher_attr=phrase_matcher_attr,
validate=validation,
validate=validate,
overwrite_ents=overwrite_ents,
ent_id_sep=ent_id_sep,
)

View File

@ -0,0 +1,330 @@
from typing import Optional, List, Dict, Any
from thinc.api import Model
from .pipe import Pipe
from ..errors import Errors
from ..language import Language
from ..lookups import Lookups, load_lookups
from ..scorer import Scorer
from ..tokens import Doc, Token
from ..vocab import Vocab
from .. import util
@Language.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={
        "model": None,
        "mode": "lookup",
        "lookups": None,
        "overwrite": False,
    },
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    lookups: Optional[Lookups],
    overwrite: bool = False,
):
    """Factory for the "lemmatizer" pipeline component.

    The lookups are resolved and validated through Lemmatizer.load_lookups
    for the pipeline's language and the requested mode before the component
    is constructed.

    nlp (Language): The pipeline; supplies the language code and the vocab.
    model (Optional[Model]): Passed through to the Lemmatizer.
    name (str): The component name.
    mode (str): The lemmatizer mode.
    lookups (Optional[Lookups]): Lookups to use, or None to load the defaults.
    overwrite (bool): Whether to overwrite existing lemmas.
    RETURNS (Lemmatizer): The constructed component.
    """
    resolved_lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
    return Lemmatizer(
        nlp.vocab,
        model,
        name,
        mode=mode,
        lookups=resolved_lookups,
        overwrite=overwrite,
    )
class Lemmatizer(Pipe):
"""
The Lemmatizer supports simple part-of-speech-sensitive suffix rules and
lookup tables.
DOCS: https://spacy.io/api/lemmatizer
"""
@classmethod
def get_lookups_config(cls, mode: str) -> Dict:
"""Returns the lookups configuration settings for a given mode for use
in Lemmatizer.load_lookups.
mode (str): The lemmatizer mode.
RETURNS (dict): The lookups configuration settings for this mode.
DOCS: https://spacy.io/api/lemmatizer#get_lookups_config
"""
if mode == "lookup":
return {
"required_tables": ["lemma_lookup"],
}
elif mode == "rule":
return {
"required_tables": ["lemma_rules"],
"optional_tables": ["lemma_exc", "lemma_index"],
}
return {}
@classmethod
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups],) -> Lookups:
"""Load and validate lookups tables. If the provided lookups is None,
load the default lookups tables according to the language and mode
settings. Confirm that all required tables for the language and mode
are present.
lang (str): The language code.
mode (str): The lemmatizer mode.
lookups (Lookups): The provided lookups, may be None if the default
lookups should be loaded.
RETURNS (Lookups): The Lookups object.
DOCS: https://spacy.io/api/lemmatizer#get_lookups_config
"""
config = cls.get_lookups_config(mode)
required_tables = config.get("required_tables", [])
optional_tables = config.get("optional_tables", [])
if lookups is None:
lookups = load_lookups(lang=lang, tables=required_tables)
optional_lookups = load_lookups(
lang=lang, tables=optional_tables, strict=False
)
for table in optional_lookups.tables:
lookups.set_table(table, optional_lookups.get_table(table))
for table in required_tables:
if table not in lookups:
raise ValueError(
Errors.E1004.format(
mode=mode, tables=required_tables, found=lookups.tables
)
)
return lookups
def __init__(
self,
vocab: Vocab,
model: Optional[Model],
name: str = "lemmatizer",
*,
mode: str = "lookup",
lookups: Optional[Lookups] = None,
overwrite: bool = False,
) -> None:
"""Initialize a Lemmatizer.
vocab (Vocab): The vocab.
model (Model): A model (not yet implemented).
name (str): The component name. Defaults to "lemmatizer".
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
lookups (Lookups): The lookups object containing the (optional) tables
such as "lemma_rules", "lemma_index", "lemma_exc" and
"lemma_lookup". Defaults to None
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
`False`.
DOCS: https://spacy.io/api/lemmatizer#init
"""
self.vocab = vocab
self.model = model
self._mode = mode
self.lookups = lookups if lookups is not None else Lookups()
self.overwrite = overwrite
if self.mode == "lookup":
self.lemmatize = self.lookup_lemmatize
elif self.mode == "rule":
self.lemmatize = self.rule_lemmatize
else:
try:
self.lemmatize = getattr(self, f"{self.mode}_lemmatize")
except AttributeError:
raise ValueError(Errors.E1003.format(mode=mode))
self.cache = {}
@property
def mode(self):
return self._mode
def __call__(self, doc: Doc) -> Doc:
"""Apply the lemmatizer to one document.
doc (Doc): The Doc to process.
RETURNS (Doc): The processed Doc.
DOCS: https://spacy.io/api/lemmatizer#call
"""
for token in doc:
if self.overwrite or token.lemma == 0:
token.lemma_ = self.lemmatize(token)[0]
return doc
def pipe(self, stream, *, batch_size=128):
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
stream (Iterable[Doc]): A stream of documents.
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.
DOCS: https://spacy.io/api/lemmatizer#pipe
"""
for doc in stream:
doc = self(doc)
yield doc
def lookup_lemmatize(self, token: Token) -> List[str]:
"""Lemmatize using a lookup-based approach.
token (Token): The token to lemmatize.
RETURNS (list): The available lemmas for the string.
DOCS: https://spacy.io/api/lemmatizer#lookup_lemmatize
"""
lookup_table = self.lookups.get_table("lemma_lookup", {})
result = lookup_table.get(token.text, token.text)
if isinstance(result, str):
result = [result]
return result
def rule_lemmatize(self, token: Token) -> List[str]:
"""Lemmatize using a rule-based approach.
token (Token): The token to lemmatize.
RETURNS (list): The available lemmas for the string.
DOCS: https://spacy.io/api/lemmatizer#rule_lemmatize
"""
cache_key = (token.orth, token.pos, token.morph)
if cache_key in self.cache:
return self.cache[cache_key]
string = token.text
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
return [string.lower()]
# See Issue #435 for example of where this logic is requied.
if self.is_base_form(token):
return [string.lower()]
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
if not any(
(
index_table.get(univ_pos),
exc_table.get(univ_pos),
rules_table.get(univ_pos),
)
):
if univ_pos == "propn":
return [string]
else:
return [string.lower()]
index = index_table.get(univ_pos, {})
exceptions = exc_table.get(univ_pos, {})
rules = rules_table.get(univ_pos, {})
orig = string
string = string.lower()
forms = []
oov_forms = []
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
# Remove duplicates but preserve the ordering of applied "rules"
forms = list(dict.fromkeys(forms))
# Put exceptions at the front of the list, so they get priority.
# This is a dodgy heuristic -- but it's the best we can do until we get
# frequencies on this. We can at least prune out problematic exceptions,
# if they shadow more frequent analyses.
for form in exceptions.get(string, []):
if form not in forms:
forms.insert(0, form)
if not forms:
forms.extend(oov_forms)
if not forms:
forms.append(orig)
self.cache[cache_key] = forms
return forms
def is_base_form(self, token: Token) -> bool:
"""Check whether the token is a base form that does not need further
analysis for lemmatization.
token (Token): The token.
RETURNS (bool): Whether the token is a base form.
DOCS: https://spacy.io/api/lemmatizer#is_base_form
"""
return False
def score(self, examples, **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores.
DOCS: https://spacy.io/api/lemmatizer#score
"""
return Scorer.score_token_attr(examples, "lemma", **kwargs)
def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist.
exclude (list): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/vocab#to_disk
"""
serialize = {}
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["lookups"] = lambda p: self.lookups.to_disk(p)
util.to_disk(path, serialize, exclude)
def from_disk(self, path, *, exclude=tuple()):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory.
exclude (list): String names of serialization fields to exclude.
RETURNS (Vocab): The modified `Vocab` object.
DOCS: https://spacy.io/api/vocab#to_disk
"""
deserialize = {}
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
util.from_disk(path, deserialize, exclude)
def to_bytes(self, *, exclude=tuple()) -> bytes:
"""Serialize the current state to a binary string.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vocab` object.
DOCS: https://spacy.io/api/vocab#to_bytes
"""
serialize = {}
serialize["vocab"] = self.vocab.to_bytes
serialize["lookups"] = self.lookups.to_bytes
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
    """Load the component's vocab and lookups from a binary string. Modifies
    the object in place and returns it.

    bytes_data (bytes): The data to load from.
    exclude (list): String names of serialization fields to exclude.
    RETURNS (Lemmatizer): The modified object.

    DOCS: https://spacy.io/api/lemmatizer#from_bytes
    """
    deserialize = {}
    deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
    deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
    util.from_bytes(bytes_data, deserialize, exclude)
    # Return the instance so the documented return value is honoured and
    # callers no longer receive None.
    return self

View File

@ -29,7 +29,6 @@ embed_size = 2000
window_size = 1
maxout_pieces = 2
subword_features = true
dropout = null
"""
DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -25,7 +25,6 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null
"""
DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -25,7 +25,6 @@ embed_size = 2000
window_size = 1
maxout_pieces = 2
subword_features = true
dropout = null
"""
DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -15,7 +15,7 @@ from .pipe import Pipe
default_model_config = """
[model]
@architectures = "spacy.BiluoTagger.v1"
@architectures = "spacy.BILUOTagger.v1"
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
@ -26,7 +26,6 @@ embed_size = 7000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null
"""
DEFAULT_SIMPLE_NER_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -31,7 +31,6 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null
"""
DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@ -39,12 +38,12 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"tagger",
assigns=["token.tag"],
default_config={"model": DEFAULT_TAGGER_MODEL, "set_morphology": False},
scores=["tag_acc", "pos_acc", "lemma_acc"],
default_config={"model": DEFAULT_TAGGER_MODEL},
scores=["tag_acc"],
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(nlp: Language, name: str, model: Model, set_morphology: bool):
return Tagger(nlp.vocab, model, name, set_morphology=set_morphology)
def make_tagger(nlp: Language, name: str, model: Model):
return Tagger(nlp.vocab, model, name)
class Tagger(Pipe):
@ -52,13 +51,14 @@ class Tagger(Pipe):
DOCS: https://spacy.io/api/tagger
"""
def __init__(self, vocab, model, name="tagger", *, set_morphology=False):
def __init__(self, vocab, model, name="tagger", *, labels=None):
"""Initialize a part-of-speech tagger.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
labels (List): The set of labels. Defaults to None.
set_morphology (bool): Whether to set morphological features.
DOCS: https://spacy.io/api/tagger#init
@ -67,7 +67,7 @@ class Tagger(Pipe):
self.model = model
self.name = name
self._rehearsal_model = None
cfg = {"set_morphology": set_morphology}
cfg = {"labels": labels or []}
self.cfg = dict(sorted(cfg.items()))
@property
@ -80,7 +80,7 @@ class Tagger(Pipe):
DOCS: https://spacy.io/api/tagger#labels
"""
return tuple(self.vocab.morphology.tag_names)
return tuple(self.cfg["labels"])
def __call__(self, doc):
"""Apply the pipe to a Doc.
@ -150,9 +150,7 @@ class Tagger(Pipe):
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef int idx = 0
cdef Vocab vocab = self.vocab
assign_morphology = self.cfg.get("set_morphology", True)
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
@ -160,15 +158,7 @@ class Tagger(Pipe):
for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber preset POS tags
if doc.c[j].tag == 0:
if doc.c[j].pos == 0 and assign_morphology:
# Don't clobber preset lemmas
lemma = doc.c[j].lemma
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
if lemma != 0 and lemma != doc.c[j].lex.orth:
doc.c[j].lemma = lemma
else:
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
idx += 1
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
doc.is_tagged = True
def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
@ -279,55 +269,26 @@ class Tagger(Pipe):
DOCS: https://spacy.io/api/tagger#begin_training
"""
lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
if not any(table in self.vocab.lookups for table in lemma_tables):
warnings.warn(Warnings.W022)
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
langs = ", ".join(util.LEXEME_NORM_LANGS)
warnings.warn(Warnings.W033.format(model="part-of-speech tagger", langs=langs))
orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = {}
tags = set()
for example in get_examples():
try:
y = example.y
except AttributeError:
raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) from None
for token in y:
tag = token.tag_
if tag in orig_tag_map:
new_tag_map[tag] = orig_tag_map[tag]
else:
new_tag_map[tag] = {POS: X}
cdef Vocab vocab = self.vocab
if new_tag_map:
if "_SP" in orig_tag_map:
new_tag_map["_SP"] = orig_tag_map["_SP"]
vocab.morphology.load_tag_map(new_tag_map)
tags.add(token.tag_)
for tag in sorted(tags):
self.add_label(tag)
self.set_output(len(self.labels))
doc_sample = [Doc(self.vocab, words=["hello", "world"])]
if pipeline is not None:
for name, component in pipeline:
if component is self:
break
if hasattr(component, "pipe"):
doc_sample = list(component.pipe(doc_sample))
else:
doc_sample = [component(doc) for doc in doc_sample]
self.model.initialize(X=doc_sample)
# Get batch of example docs, example outputs to call begin_training().
# This lets the model infer shapes.
self.model.initialize()
if sgd is None:
sgd = self.create_optimizer()
return sgd
def add_label(self, label, values=None):
def add_label(self, label):
"""Add a new label to the pipe.
label (str): The label to add.
values (Dict[int, str]): Optional values to map to the label, e.g. a
tag map dictionary.
RETURNS (int): 0 if label is already present, otherwise 1.
DOCS: https://spacy.io/api/tagger#add_label
@ -336,22 +297,8 @@ class Tagger(Pipe):
raise ValueError(Errors.E187)
if label in self.labels:
return 0
if self.model.has_dim("nO"):
# Here's how the model resizing will work, once the
# neuron-to-tag mapping is no longer controlled by
# the Morphology class, which sorts the tag names.
# The sorting makes adding labels difficult.
# smaller = self.model._layers[-1]
# larger = Softmax(len(self.labels)+1, smaller.nI)
# copy_array(larger.W[:smaller.nO], smaller.W)
# copy_array(larger.b[:smaller.nO], smaller.b)
# self.model._layers[-1] = larger
raise ValueError(TempErrors.T003)
tag_map = dict(self.vocab.morphology.tag_map)
if values is None:
values = {POS: "X"}
tag_map[label] = values
self.vocab.morphology.load_tag_map(tag_map)
self.cfg["labels"].append(label)
self.vocab.strings.add(label)
return 1
def score(self, examples, **kwargs):
@ -363,11 +310,7 @@ class Tagger(Pipe):
DOCS: https://spacy.io/api/tagger#score
"""
scores = {}
scores.update(Scorer.score_token_attr(examples, "tag", **kwargs))
scores.update(Scorer.score_token_attr(examples, "pos", **kwargs))
scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
return scores
return Scorer.score_token_attr(examples, "tag", **kwargs)
def to_bytes(self, *, exclude=tuple()):
"""Serialize the pipe to a bytestring.
@ -381,10 +324,6 @@ class Tagger(Pipe):
serialize["model"] = self.model.to_bytes
serialize["vocab"] = self.vocab.to_bytes
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
morph_rules = dict(self.vocab.morphology.exc)
serialize["morph_rules"] = lambda: srsly.msgpack_dumps(morph_rules)
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, *, exclude=tuple()):
@ -402,21 +341,8 @@ class Tagger(Pipe):
except AttributeError:
raise ValueError(Errors.E149) from None
def load_tag_map(b):
tag_map = srsly.msgpack_loads(b)
self.vocab.morphology.load_tag_map(tag_map)
def load_morph_rules(b):
morph_rules = srsly.msgpack_loads(b)
self.vocab.morphology.load_morph_exceptions(morph_rules)
self.vocab.morphology = Morphology(self.vocab.strings, dict(),
lemmatizer=self.vocab.morphology.lemmatizer)
deserialize = {
"vocab": lambda b: self.vocab.from_bytes(b),
"tag_map": load_tag_map,
"morph_rules": load_morph_rules,
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
"model": lambda b: load_model(b),
}
@ -431,12 +357,8 @@ class Tagger(Pipe):
DOCS: https://spacy.io/api/tagger#to_disk
"""
tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
morph_rules = dict(self.vocab.morphology.exc)
serialize = {
"vocab": lambda p: self.vocab.to_disk(p),
"tag_map": lambda p: srsly.write_msgpack(p, tag_map),
"morph_rules": lambda p: srsly.write_msgpack(p, morph_rules),
"model": lambda p: self.model.to_disk(p),
"cfg": lambda p: srsly.write_json(p, self.cfg),
}
@ -458,22 +380,9 @@ class Tagger(Pipe):
except AttributeError:
raise ValueError(Errors.E149) from None
def load_tag_map(p):
tag_map = srsly.read_msgpack(p)
self.vocab.morphology.load_tag_map(tag_map)
def load_morph_rules(p):
morph_rules = srsly.read_msgpack(p)
self.vocab.morphology.load_morph_exceptions(morph_rules)
self.vocab.morphology = Morphology(self.vocab.strings, dict(),
lemmatizer=self.vocab.morphology.lemmatizer)
deserialize = {
"vocab": lambda p: self.vocab.from_disk(p),
"cfg": lambda p: self.cfg.update(deserialize_config(p)),
"tag_map": load_tag_map,
"morph_rules": load_morph_rules,
"model": load_model,
}
util.from_disk(path, deserialize, exclude)

View File

@ -30,8 +30,8 @@ bow_model_config = """
[model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size: 1
no_output_layer: false
ngram_size = 1
no_output_layer = false
"""
cnn_model_config = """
@ -48,7 +48,6 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null
"""

View File

@ -20,7 +20,6 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null
"""
DEFAULT_TOK2VEC_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -220,7 +220,6 @@ class ConfigSchemaNlp(BaseModel):
lang: StrictStr = Field(..., title="The base language to use")
pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
tokenizer: Callable = Field(..., title="The tokenizer to use")
lemmatizer: Callable = Field(..., title="The lemmatizer to use")
load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")

View File

@ -242,7 +242,8 @@ class Scorer:
per_feat[field].score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
)
return {f"{attr}_per_feat": per_feat}
result = {k: v.to_dict() for k, v in per_feat.items()}
return {f"{attr}_per_feat": result}
@staticmethod
def score_spans(
@ -318,6 +319,7 @@ class Scorer:
labels: Iterable[str] = tuple(),
multi_label: bool = True,
positive_label: Optional[str] = None,
threshold: Optional[float] = None,
**cfg,
) -> Dict[str, Any]:
"""Returns PRF and ROC AUC scores for a doc-level attribute with a
@ -333,94 +335,104 @@ class Scorer:
Defaults to True.
positive_label (str): The positive label for a binary task with
exclusive classes. Defaults to None.
threshold (float): Cutoff to consider a prediction "positive". Defaults
to 0.5 for multi-label, and 0.0 (i.e. whatever's highest scoring)
otherwise.
RETURNS (Dict[str, Any]): A dictionary containing the scores, with
inapplicable scores as None:
for all:
attr_score (one of attr_f / attr_macro_f / attr_macro_auc),
attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
attr_score_desc (text description of the overall score),
attr_micro_f,
attr_macro_f,
attr_auc,
attr_f_per_type,
attr_auc_per_type
for binary exclusive with positive label: attr_p/r/f
for 3+ exclusive classes, macro-averaged fscore: attr_macro_f
for multilabel, macro-averaged AUC: attr_macro_auc
DOCS: https://spacy.io/api/scorer#score_cats
"""
score = PRFScore()
f_per_type = dict()
auc_per_type = dict()
for label in labels:
f_per_type[label] = PRFScore()
auc_per_type[label] = ROCAUCScore()
if threshold is None:
threshold = 0.5 if multi_label else 0.0
f_per_type = {label: PRFScore() for label in labels}
auc_per_type = {label: ROCAUCScore() for label in labels}
labels = set(labels)
if labels:
for eg in examples:
labels.update(eg.predicted.cats.keys())
labels.update(eg.reference.cats.keys())
for example in examples:
gold_doc = example.reference
pred_doc = example.predicted
gold_values = getter(gold_doc, attr)
pred_values = getter(pred_doc, attr)
if (
len(gold_values) > 0
and set(f_per_type) == set(auc_per_type) == set(gold_values)
and set(gold_values) == set(pred_values)
):
gold_val = max(gold_values, key=gold_values.get)
pred_val = max(pred_values, key=pred_values.get)
if positive_label:
score.score_set(
set([positive_label]) & set([pred_val]),
set([positive_label]) & set([gold_val]),
)
for label in set(gold_values):
auc_per_type[label].score_set(
pred_values[label], gold_values[label]
)
f_per_type[label].score_set(
set([label]) & set([pred_val]), set([label]) & set([gold_val])
)
elif len(f_per_type) > 0:
model_labels = set(f_per_type)
eval_labels = set(gold_values)
raise ValueError(
Errors.E162.format(
model_labels=model_labels, eval_labels=eval_labels
)
)
elif len(auc_per_type) > 0:
model_labels = set(auc_per_type)
eval_labels = set(gold_values)
raise ValueError(
Errors.E162.format(
model_labels=model_labels, eval_labels=eval_labels
)
)
# Through this loop, None in the gold_cats indicates missing label.
pred_cats = getter(example.predicted, attr)
gold_cats = getter(example.reference, attr)
# I think the AUC metric is applicable regardless of whether we're
# doing multi-label classification? Unsure. If not, move this into
# the elif pred_cats and gold_cats block below.
for label in labels:
pred_score = pred_cats.get(label, 0.0)
gold_score = gold_cats.get(label, 0.0)
if gold_score is not None:
auc_per_type[label].score_set(pred_score, gold_score)
if multi_label:
for label in labels:
pred_score = pred_cats.get(label, 0.0)
gold_score = gold_cats.get(label, 0.0)
if gold_score is not None:
if pred_score >= threshold and gold_score > 0:
f_per_type[label].tp += 1
elif pred_score >= threshold and gold_score == 0:
f_per_type[label].fp += 1
elif pred_score < threshold and gold_score > 0:
f_per_type[label].fn += 1
elif pred_cats and gold_cats:
# Get the highest-scoring for each.
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
if gold_score is not None:
if pred_label == gold_label and pred_score >= threshold:
f_per_type[pred_label].tp += 1
else:
f_per_type[gold_label].fn += 1
if pred_score >= threshold:
f_per_type[pred_label].fp += 1
elif gold_cats:
gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
if gold_score is not None and gold_score > 0:
f_per_type[gold_label].fn += 1
else:
pred_label, pred_score = max(pred_cats, key=lambda it: it[1])
if pred_score >= threshold:
f_per_type[pred_label].fp += 1
micro_prf = PRFScore()
for label_prf in f_per_type.values():
micro_prf.tp = label_prf.tp
micro_prf.fn = label_prf.fn
micro_prf.fp = label_prf.fp
n_cats = len(f_per_type) + 1e-100
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
results = {
f"{attr}_score": None,
f"{attr}_score_desc": None,
f"{attr}_p": None,
f"{attr}_r": None,
f"{attr}_f": None,
f"{attr}_macro_f": None,
f"{attr}_micro_p": micro_prf.precision,
f"{attr}_micro_r": micro_prf.recall,
f"{attr}_micro_f": micro_prf.fscore,
f"{attr}_macro_p": macro_p,
f"{attr}_macro_r": macro_r,
f"{attr}_macro_f": macro_f,
f"{attr}_macro_auc": None,
f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
}
if len(labels) == 2 and not multi_label and positive_label:
results[f"{attr}_p"] = score.precision
results[f"{attr}_r"] = score.recall
results[f"{attr}_f"] = score.fscore
results[f"{attr}_score"] = results[f"{attr}_f"]
positive_label_f = results[f"{attr}_f_per_type"][positive_label]['f']
results[f"{attr}_score"] = positive_label_f
results[f"{attr}_score_desc"] = f"F ({positive_label})"
elif not multi_label:
results[f"{attr}_macro_f"] = sum(
[score.fscore for label, score in f_per_type.items()]
) / (len(f_per_type) + 1e-100)
results[f"{attr}_score"] = results[f"{attr}_macro_f"]
results[f"{attr}_score_desc"] = "macro F"
else:
results[f"{attr}_macro_auc"] = max(
sum([score.score for label, score in auc_per_type.items()])
/ (len(auc_per_type) + 1e-100),
-1,
)
results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
results[f"{attr}_score_desc"] = "macro AUC"
return results

View File

@ -201,7 +201,7 @@ def ru_tokenizer():
@pytest.fixture
def ru_lemmatizer():
pytest.importorskip("pymorphy2")
return get_lang_class("ru")().vocab.morphology.lemmatizer
return get_lang_class("ru")().add_pipe("lemmatizer")
@pytest.fixture(scope="session")

View File

@ -1,21 +1,12 @@
import pytest
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from spacy import util
@pytest.fixture
def lemmatizer():
lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"})
return Lemmatizer(lookups)
@pytest.fixture
def vocab(lemmatizer):
return Vocab(lemmatizer=lemmatizer)
def vocab():
return Vocab()
def test_empty_doc(vocab):
@ -30,14 +21,6 @@ def test_single_word(vocab):
assert doc.text == "a"
def test_lookup_lemmatization(vocab):
doc = Doc(vocab, words=["dogs", "dogses"])
assert doc[0].text == "dogs"
assert doc[0].lemma_ == "dog"
assert doc[1].text == "dogses"
assert doc[1].lemma_ == "dogses"
def test_create_from_words_and_text(vocab):
# no whitespace in words
words = ["'", "dogs", "'", "run"]

View File

@ -1,23 +1,17 @@
import pytest
from spacy.symbols import POS, PRON, VERB
@pytest.fixture
def i_has(en_tokenizer):
doc = en_tokenizer("I has")
tag_map = {
"PRP": {POS: PRON, "PronType": "prs"},
"VBZ": {
POS: VERB,
"VerbForm": "fin",
"Tense": "pres",
"Number": "sing",
"Person": "three",
},
doc[0].morph_ = {"PronType": "prs"}
doc[1].morph_ = {
"VerbForm": "fin",
"Tense": "pres",
"Number": "sing",
"Person": "three",
}
en_tokenizer.vocab.morphology.load_tag_map(tag_map)
doc[0].tag_ = "PRP"
doc[1].tag_ = "VBZ"
return doc

View File

@ -124,7 +124,6 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
assert doc[0].text == "The players"
assert doc[0].tag_ == "NN"
assert doc[0].pos_ == "NOUN"
assert doc[0].lemma_ == "The players"
doc = get_doc(
tokens.vocab,
words=[t.text for t in tokens],
@ -143,11 +142,9 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
assert doc[0].text == "The players"
assert doc[0].tag_ == "NN"
assert doc[0].pos_ == "NOUN"
assert doc[0].lemma_ == "The players"
assert doc[1].text == "start ."
assert doc[1].tag_ == "VBZ"
assert doc[1].pos_ == "VERB"
assert doc[1].lemma_ == "start ."
def test_doc_retokenize_spans_merge_heads(en_tokenizer):

View File

@ -1,21 +0,0 @@
from spacy.symbols import POS, PRON, VERB, DET, NOUN, PUNCT
from ...util import get_doc
def test_en_tagger_load_morph_exc(en_tokenizer):
    """Check that a morph exception supplies the lemma for a tagged token."""
    pos_by_tag = {
        "PRP": {POS: PRON},
        "VBP": {POS: VERB},
        "PRP$": {POS: DET},
        "NN": {POS: NOUN},
        ".": {POS: PUNCT},
    }
    lemma_exceptions = {"VBP": {"like": {"lemma": "luck"}}}
    # Load the tag map first, then the exceptions that refer to its tags.
    en_tokenizer.vocab.morphology.load_tag_map(pos_by_tag)
    en_tokenizer.vocab.morphology.load_morph_exceptions(lemma_exceptions)
    tokens = en_tokenizer("I like his style.")
    words = [t.text for t in tokens]
    doc = get_doc(tokens.vocab, words=words, tags=["PRP", "VBP", "PRP$", "NN", "."])
    like = doc[1]
    assert like.tag_ == "VBP"
    assert like.lemma_ == "luck"

View File

@ -3,15 +3,16 @@ import pytest
from ...util import get_doc
@pytest.mark.xfail(reason="TODO: investigate why lemmatizer fails here")
def test_ru_doc_lemmatization(ru_tokenizer):
def test_ru_doc_lemmatization(ru_lemmatizer):
words = ["мама", "мыла", "раму"]
tags = [
"NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
"VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
"NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
pos = ["NOUN", "VERB", "NOUN"]
morphs = [
"Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
"Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
"Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
]
doc = get_doc(ru_tokenizer.vocab, words=words, tags=tags)
doc = get_doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
doc = ru_lemmatizer(doc)
lemmas = [token.lemma_ for token in doc]
assert lemmas == ["мама", "мыть", "рама"]
@ -27,43 +28,51 @@ def test_ru_doc_lemmatization(ru_tokenizer):
],
)
def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
assert sorted(ru_lemmatizer.noun(text)) == lemmas
doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"])
result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
assert sorted(result_lemmas) == lemmas
@pytest.mark.parametrize(
"text,pos,morphology,lemma",
"text,pos,morph,lemma",
[
("рой", "NOUN", None, "рой"),
("рой", "VERB", None, "рыть"),
("клей", "NOUN", None, "клей"),
("клей", "VERB", None, "клеить"),
("три", "NUM", None, "три"),
("кос", "NOUN", {"Number": "Sing"}, "кос"),
("кос", "NOUN", {"Number": "Plur"}, "коса"),
("кос", "ADJ", None, "косой"),
("потом", "NOUN", None, "пот"),
("потом", "ADV", None, "потом"),
("рой", "NOUN", "", "рой"),
("рой", "VERB", "", "рыть"),
("клей", "NOUN", "", "клей"),
("клей", "VERB", "", "клеить"),
("три", "NUM", "", "три"),
("кос", "NOUN", "Number=Sing", "кос"),
("кос", "NOUN", "Number=Plur", "коса"),
("кос", "ADJ", "", "косой"),
("потом", "NOUN", "", "пот"),
("потом", "ADV", "", "потом"),
],
)
def test_ru_lemmatizer_works_with_different_pos_homonyms(
ru_lemmatizer, text, pos, morphology, lemma
ru_lemmatizer, text, pos, morph, lemma
):
assert ru_lemmatizer(text, pos, morphology) == [lemma]
doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph])
result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
assert result_lemmas == [lemma]
@pytest.mark.parametrize(
"text,morphology,lemma",
"text,morph,lemma",
[
("гвоздики", {"Gender": "Fem"}, "гвоздика"),
("гвоздики", {"Gender": "Masc"}, "гвоздик"),
("вина", {"Gender": "Fem"}, "вина"),
("вина", {"Gender": "Neut"}, "вино"),
("гвоздики", "Gender=Fem", "гвоздика"),
("гвоздики", "Gender=Masc", "гвоздик"),
("вина", "Gender=Fem", "вина"),
("вина", "Gender=Neut", "вино"),
],
)
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma):
assert ru_lemmatizer.noun(text, morphology) == [lemma]
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma):
doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph])
result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
assert result_lemmas == [lemma]
def test_ru_lemmatizer_punct(ru_lemmatizer):
assert ru_lemmatizer.punct("«") == ['"']
assert ru_lemmatizer.punct("»") == ['"']
doc = get_doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"])
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
doc = get_doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']

View File

@ -0,0 +1,34 @@
import pytest
from spacy import registry
from spacy.lookups import Lookups
from spacy.util import get_lang_class
# fmt: off
# Only include languages with no external dependencies
# excluded: ru, uk
# excluded for custom tables: pl
LANGUAGES = ["el", "en", "fr", "nl"]
# fmt: on
@pytest.mark.parametrize("lang", LANGUAGES)
def test_lemmatizer_initialize(lang, capfd):
    """Test that languages can be initialized."""

    # Register a minimal lookups asset containing each lemma table name, so
    # the lemmatizer can be created from in-test tables.
    @registry.assets("lemmatizer_init_lookups")
    def lemmatizer_init_lookups():
        lookups = Lookups()
        lookups.add_table("lemma_lookup", {"cope": "cope"})
        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
        return lookups

    nlp = get_lang_class(lang)()
    nlp.add_pipe(
        "lemmatizer", config={"lookups": {"@assets": "lemmatizer_init_lookups"}}
    )
    # Check for stray print statements (see #3342)
    doc = nlp("test")  # noqa: F841
    captured = capfd.readouterr()
    assert not captured.out

View File

@ -1,14 +1,11 @@
import pytest
from spacy.morphology import Morphology
from spacy.strings import StringStore, get_string_id
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
@pytest.fixture
def morphology():
lemmatizer = Lemmatizer(Lookups())
return Morphology(StringStore(), {}, lemmatizer)
return Morphology(StringStore())
def test_init(morphology):

View File

@ -2,21 +2,18 @@ import pytest
import pickle
from spacy.morphology import Morphology
from spacy.strings import StringStore
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
@pytest.fixture
def morphology():
tag_map = {"A": {"POS": "X"}, "B": {"POS": "NOUN"}}
exc = {"A": {"a": {"POS": "VERB"}}}
lemmatizer = Lemmatizer(Lookups())
return Morphology(StringStore(), tag_map, lemmatizer, exc=exc)
morphology = Morphology(StringStore())
morphology.add("Feat1=Val1|Feat2=Val2")
morphology.add("Feat3=Val3|Feat4=Val4")
return morphology
def test_morphology_pickle_roundtrip(morphology):
b = pickle.dumps(morphology)
reloaded_morphology = pickle.loads(b)
assert morphology.tag_map == reloaded_morphology.tag_map
assert morphology.exc == reloaded_morphology.exc
assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2"
assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4"

View File

@ -82,10 +82,10 @@ def test_parser_merge_pp(en_tokenizer):
text = "A phrase with another phrase occurs"
heads = [1, 4, -1, 1, -2, 0]
deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT"]
tags = ["DT", "NN", "IN", "DT", "NN", "VBZ"]
pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"]
tokens = en_tokenizer(text)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos,
)
with doc.retokenize() as retokenizer:
for np in doc.noun_chunks:

View File

@ -0,0 +1,109 @@
import pytest
from spacy import util, registry
from spacy.lang.en import English
from spacy.lookups import Lookups, load_lookups
from ..util import make_tempdir
@pytest.fixture
def nlp():
    """Return a newly constructed English language object."""
    english = English()
    return english
@pytest.fixture
def lemmatizer(nlp):
    """Add a rule-mode lemmatizer to `nlp`, backed by a small "cope" lookups
    asset, and return the created component."""

    @registry.assets("cope_lookups")
    def cope_lookups():
        tables = Lookups()
        table_data = (
            ("lemma_lookup", {"cope": "cope"}),
            ("lemma_index", {"verb": ("cope", "cop")}),
            ("lemma_exc", {"verb": {"coping": ("cope",)}}),
            ("lemma_rules", {"verb": [["ing", ""]]}),
        )
        for table_name, data in table_data:
            tables.add_table(table_name, data)
        return tables

    pipe_config = {"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
    return nlp.add_pipe("lemmatizer", config=pipe_config)
def test_lemmatizer_init(nlp):
    """Check lookup-mode construction, the fallback when the tables are
    emptied, and that creating the pipe from empty lookups raises."""

    @registry.assets("cope_lookups")
    def cope_lookups():
        tables = Lookups()
        tables.add_table("lemma_lookup", {"cope": "cope"})
        tables.add_table("lemma_index", {"verb": ("cope", "cop")})
        tables.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
        tables.add_table("lemma_rules", {"verb": [["ing", ""]]})
        return tables

    lookup_config = {"mode": "lookup", "lookups": {"@assets": "cope_lookups"}}
    lemmatizer = nlp.add_pipe("lemmatizer", config=lookup_config)
    assert isinstance(lemmatizer.lookups, Lookups)
    assert lemmatizer.mode == "lookup"
    # replace any tables from spacy-lookups-data
    lemmatizer.lookups = Lookups()
    first_token = nlp("coping")[0]
    # lookup with no tables sets text as lemma
    assert first_token.lemma_ == "coping"

    nlp.remove_pipe("lemmatizer")

    @registry.assets("empty_lookups")
    def empty_lookups():
        return Lookups()

    empty_config = {"mode": "lookup", "lookups": {"@assets": "empty_lookups"}}
    with pytest.raises(ValueError):
        nlp.add_pipe("lemmatizer", config=empty_config)
def test_lemmatizer_config(nlp, lemmatizer):
    """Check that the rule-mode lemmatizer maps the verb "coping" to "cope",
    running the same check twice on freshly made docs."""
    # NOTE(review): the second identical pass presumably exercises a cached
    # code path in the lemmatizer -- confirm before collapsing to one pass.
    for _ in range(2):
        doc = nlp.make_doc("coping")
        doc[0].pos_ = "VERB"
        # No lemma is set until the lemmatizer has run.
        assert doc[0].lemma_ == ""
        doc = lemmatizer(doc)
        assert doc[0].text == "coping"
        assert doc[0].lemma_ == "cope"
def test_lemmatizer_serialize(nlp, lemmatizer):
    """Round-trip the lemmatizer through bytes and through disk, checking
    that the serialized state and the lemmatization results are preserved."""

    @registry.assets("cope_lookups")
    def cope_lookups():
        lookups = Lookups()
        lookups.add_table("lemma_lookup", {"cope": "cope"})
        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
        return lookups

    # Bytes round trip: a second lemmatizer loaded from the first one's bytes
    # must serialize identically and hold the same tables.
    nlp2 = English()
    lemmatizer2 = nlp2.add_pipe(
        "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
    )
    lemmatizer2.from_bytes(lemmatizer.to_bytes())
    assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
    assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2.make_doc("coping")
        doc2[0].pos_ = "VERB"
        assert doc2[0].lemma_ == ""
        # Run the lemmatizer that was reloaded from disk; applying the
        # original in-memory component here would leave the reloaded one
        # untested.
        doc2 = nlp2.get_pipe("lemmatizer")(doc2)
        assert doc2[0].text == "coping"
        assert doc2[0].lemma_ == "cope"

View File

@ -23,13 +23,12 @@ def test_tagger_begin_training_tag_map():
nlp = Language()
tagger = nlp.add_pipe("tagger")
orig_tag_count = len(tagger.labels)
tagger.add_label("A", {"POS": "NOUN"})
tagger.add_label("A")
nlp.begin_training()
assert nlp.vocab.morphology.tag_map["A"] == {POS: NOUN}
assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)
TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
TAGS = ("N", "V", "J")
MORPH_RULES = {"V": {"like": {"lemma": "luck"}}}
@ -42,15 +41,12 @@ TRAIN_DATA = [
def test_overfitting_IO():
# Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
nlp = English()
nlp.vocab.morphology.load_tag_map(TAG_MAP)
nlp.vocab.morphology.load_morph_exceptions(MORPH_RULES)
tagger = nlp.add_pipe("tagger", config={"set_morphology": True})
nlp.vocab.morphology.load_tag_map(TAG_MAP)
tagger = nlp.add_pipe("tagger")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
for tag, values in TAG_MAP.items():
tagger.add_label(tag, values)
for tag in TAGS:
tagger.add_label(tag)
optimizer = nlp.begin_training()
for i in range(50):
@ -65,7 +61,6 @@ def test_overfitting_IO():
assert doc[1].tag_ is "V"
assert doc[2].tag_ is "J"
assert doc[3].tag_ is "N"
assert doc[1].lemma_ == "luck"
# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:
@ -76,4 +71,3 @@ def test_overfitting_IO():
assert doc2[1].tag_ is "V"
assert doc2[2].tag_ is "J"
assert doc2[3].tag_ is "N"
assert doc[1].lemma_ == "luck"

View File

@ -117,8 +117,10 @@ def test_overfitting_IO():
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
# Test scoring
scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
assert scores["cats_f"] == 1.0
scores = nlp.evaluate(
train_examples, scorer_cfg={"positive_label": "POSITIVE"}
)
assert scores["cats_micro_f"] == 1.0
assert scores["cats_score"] == 1.0
assert "cats_score_desc" in scores

View File

@ -8,10 +8,8 @@ from spacy.attrs import IS_PUNCT, ORTH, LOWER
from spacy.symbols import POS, VERB
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from spacy.tokens import Doc, Span
from spacy.lang.en.lemmatizer import is_base_form
from ..util import get_doc, make_tempdir
@ -157,16 +155,15 @@ def test_issue590(en_vocab):
assert len(matches) == 2
@pytest.mark.skip(reason="Old vocab-based lemmatization")
def test_issue595():
"""Test lemmatization of base forms"""
words = ["Do", "n't", "feed", "the", "dog"]
tag_map = {"VB": {POS: VERB, "VerbForm": "inf"}}
lookups = Lookups()
lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
lookups.add_table("lemma_index", {"verb": {}})
lookups.add_table("lemma_exc", {"verb": {}})
lemmatizer = Lemmatizer(lookups, is_base_form=is_base_form)
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
vocab = Vocab()
doc = Doc(vocab, words=words)
doc[2].tag_ = "VB"
assert doc[2].text == "feed"
@ -389,6 +386,7 @@ def test_issue891(en_tokenizer, text):
assert tokens[1].text == "/"
@pytest.mark.skip(reason="Old vocab-based lemmatization")
@pytest.mark.parametrize(
"text,tag,lemma",
[("anus", "NN", "anus"), ("princess", "NN", "princess"), ("inner", "JJ", "inner")],

View File

@ -6,7 +6,6 @@ from spacy.lang.en import English
from spacy.lang.lex_attrs import LEX_ATTRS
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from spacy.symbols import ORTH, LEMMA, POS, VERB
@ -57,6 +56,7 @@ def test_issue1242():
assert len(docs[1]) == 1
@pytest.mark.skip(reason="v3 no longer supports LEMMA/POS in tokenizer special cases")
def test_issue1250():
"""Test cached special cases."""
special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}]
@ -87,20 +87,6 @@ def test_issue1375():
assert doc[1].nbor(1).text == "2"
def test_issue1387():
tag_map = {"VBG": {POS: VERB, "VerbForm": "part"}}
lookups = Lookups()
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
lemmatizer = Lemmatizer(lookups)
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
doc = Doc(vocab, words=["coping"])
doc[0].tag_ = "VBG"
assert doc[0].text == "coping"
assert doc[0].lemma_ == "cope"
def test_issue1434():
"""Test matches occur when optional element at end of short doc."""
pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]

View File

@ -130,8 +130,6 @@ def test_issue1727():
vectors = Vectors(data=data, keys=["I", "am", "Matt"])
tagger = nlp.create_pipe("tagger")
tagger.add_label("PRP")
with pytest.warns(UserWarning):
tagger.begin_training()
assert tagger.cfg.get("pretrained_dims", 0) == 0
tagger.vocab.vectors = vectors
with make_tempdir() as path:

View File

@ -19,8 +19,8 @@ def test_issue2564():
"""Test the tagger sets is_tagged correctly when used via Language.pipe."""
nlp = Language()
tagger = nlp.add_pipe("tagger")
with pytest.warns(UserWarning):
tagger.begin_training() # initialise weights
tagger.add_label("A")
tagger.begin_training()
doc = nlp("hello world")
assert doc.is_tagged
docs = nlp.pipe(["hello", "world"])

View File

@ -241,11 +241,11 @@ def test_issue3449():
assert t3[5].text == "I"
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3456():
# this crashed because of a padding error in layer.ops.unflatten in thinc
nlp = English()
nlp.add_pipe("tagger")
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
nlp.begin_training()
list(nlp.pipe(["hi", ""]))

View File

@ -149,13 +149,15 @@ def test_issue3540(en_vocab):
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
for i, lemma in enumerate(gold_lemma):
doc[i].lemma_ = lemma
assert [token.lemma_ for token in doc] == gold_lemma
vectors_1 = [token.vector for token in doc]
assert len(vectors_1) == len(doc)
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]}
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]

View File

@ -271,6 +271,7 @@ def test_issue4267():
assert token.ent_iob == 2
@pytest.mark.skip(reason="lemmatizer lookups no longer in vocab")
def test_issue4272():
"""Test that lookup table can be accessed from Token.lemma if no POS tags
are available."""

View File

@ -62,8 +62,7 @@ def tagger():
# need to add model for two reasons:
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
with pytest.warns(UserWarning):
tagger.begin_training(pipeline=nlp.pipeline)
tagger.begin_training(pipeline=nlp.pipeline)
return tagger

View File

@ -48,7 +48,6 @@ window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
dropout = null
[components.tagger]
factory = "tagger"
@ -78,7 +77,6 @@ embed_size = 5555
window_size = 1
maxout_pieces = 7
subword_features = false
dropout = null
"""

View File

@ -44,8 +44,8 @@ def blank_parser(en_vocab):
def taggers(en_vocab):
cfg = {"model": DEFAULT_TAGGER_MODEL}
model = registry.make_from_config(cfg, validate=True)["model"]
tagger1 = Tagger(en_vocab, model, set_morphology=True)
tagger2 = Tagger(en_vocab, model, set_morphology=True)
tagger1 = Tagger(en_vocab, model)
tagger2 = Tagger(en_vocab, model)
return tagger1, tagger2
@ -125,8 +125,8 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
tagger2.to_disk(file_path2)
cfg = {"model": DEFAULT_TAGGER_MODEL}
model = registry.make_from_config(cfg, validate=True)["model"]
tagger1_d = Tagger(en_vocab, model, set_morphology=True).from_disk(file_path1)
tagger2_d = Tagger(en_vocab, model, set_morphology=True).from_disk(file_path2)
tagger1_d = Tagger(en_vocab, model).from_disk(file_path1)
tagger2_d = Tagger(en_vocab, model).from_disk(file_path2)
assert tagger1_d.to_bytes() == tagger2_d.to_bytes()

View File

@ -8,7 +8,6 @@ from ..util import make_tempdir
test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
default_strings = ("_SP", "POS=SPACE")
@pytest.mark.parametrize("text", ["rat"])
@ -34,10 +33,8 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
assert vocab1.to_bytes() == vocab1_b
new_vocab1 = Vocab().from_bytes(vocab1_b)
assert new_vocab1.to_bytes() == vocab1_b
assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE
assert sorted([s for s in new_vocab1.strings]) == sorted(
strings1 + list(default_strings)
)
assert len(new_vocab1.strings) == len(strings1)
assert sorted([s for s in new_vocab1.strings]) == sorted(strings1)
@pytest.mark.parametrize("strings1,strings2", test_strings)
@ -52,16 +49,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
vocab1_d = Vocab().from_disk(file_path1)
vocab2_d = Vocab().from_disk(file_path2)
# check strings rather than lexemes, which are only reloaded on demand
assert strings1 == [s for s in vocab1_d.strings if s not in default_strings]
assert strings2 == [s for s in vocab2_d.strings if s not in default_strings]
assert strings1 == [s for s in vocab1_d.strings]
assert strings2 == [s for s in vocab2_d.strings]
if strings1 == strings2:
assert [s for s in vocab1_d.strings if s not in default_strings] == [
s for s in vocab2_d.strings if s not in default_strings
]
assert [s for s in vocab1_d.strings] == [s for s in vocab2_d.strings]
else:
assert [s for s in vocab1_d.strings if s not in default_strings] != [
s for s in vocab2_d.strings if s not in default_strings
]
assert [s for s in vocab1_d.strings] != [s for s in vocab2_d.strings]
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@ -80,7 +73,7 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
# Reported in #2153
vocab = Vocab(strings=strings)
vocab.from_bytes(vocab.to_bytes())
assert len(vocab.strings) == len(strings) + 2 # adds _SP and POS=SPACE
assert len(vocab.strings) == len(strings)
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)

View File

@ -484,12 +484,10 @@ def test_roundtrip_docs_to_docbin(doc):
json_file = tmpdir / "roundtrip.json"
srsly.write_json(json_file, [docs_to_json(doc)])
output_file = tmpdir / "roundtrip.spacy"
data = DocBin(docs=[doc]).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
DocBin(docs=[doc]).to_disk(output_file)
reader = Corpus(output_file)
reloaded_examples = list(reader(reloaded_nlp))
assert len(doc) == sum(len(eg) for eg in reloaded_examples)
assert len(doc) == sum(len(eg) for eg in reloaded_examples)
reloaded_example = reloaded_examples[0]
assert text == reloaded_example.reference.text
assert idx == [t.idx for t in reloaded_example.reference]
@ -512,13 +510,11 @@ def test_make_orth_variants(doc):
nlp = English()
with make_tempdir() as tmpdir:
output_file = tmpdir / "roundtrip.spacy"
data = DocBin(docs=[doc]).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
DocBin(docs=[doc]).to_disk(output_file)
# due to randomness, test only that this runs with no errors for now
reader = Corpus(output_file)
train_example = next(reader(nlp))
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
@pytest.mark.skip("Outdated")

View File

@ -1,64 +0,0 @@
import pytest
from spacy.tokens import Doc
from spacy.language import Language
from spacy.lookups import Lookups
from spacy.lemmatizer import Lemmatizer
@pytest.mark.skip(reason="We probably don't want to support this anymore in v3?")
def test_lemmatizer_reflects_lookups_changes():
"""Test for an issue that'd cause lookups available in a model loaded from
disk to not be reflected in the lemmatizer."""
nlp = Language()
assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "foo"
table = nlp.vocab.lookups.add_table("lemma_lookup")
table["foo"] = "bar"
assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "bar"
table = nlp.vocab.lookups.get_table("lemma_lookup")
table["hello"] = "world"
# The update to the table should be reflected in the lemmatizer
assert Doc(nlp.vocab, words=["hello"])[0].lemma_ == "world"
new_nlp = Language()
table = new_nlp.vocab.lookups.add_table("lemma_lookup")
table["hello"] = "hi"
assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "hi"
nlp_bytes = nlp.to_bytes()
new_nlp.from_bytes(nlp_bytes)
# Make sure we have the previously saved lookup table
assert "lemma_lookup" in new_nlp.vocab.lookups
assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2
assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world"
assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar"
assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world"
def test_tagger_warns_no_lookups():
nlp = Language()
nlp.vocab.lookups = Lookups()
assert not len(nlp.vocab.lookups)
tagger = nlp.add_pipe("tagger")
with pytest.warns(UserWarning):
tagger.begin_training()
with pytest.warns(UserWarning):
nlp.begin_training()
nlp.vocab.lookups.add_table("lemma_lookup")
nlp.vocab.lookups.add_table("lexeme_norm")
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
with pytest.warns(None) as record:
nlp.begin_training()
assert not record.list
def test_lemmatizer_without_is_base_form_implementation():
# Norwegian example from #5658
lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": []})
lookups.add_table("lemma_index", {"noun": {}})
lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}})
lemmatizer = Lemmatizer(lookups, is_base_form=None)
assert lemmatizer(
"Formuesskatten",
"noun",
{"Definite": "def", "Gender": "masc", "Number": "sing"},
) == ["formuesskatt"]

View File

@ -259,7 +259,7 @@ def test_tag_score(tagged_doc):
assert results["tag_acc"] == 1.0
assert results["pos_acc"] == 1.0
assert results["morph_acc"] == 1.0
assert results["morph_per_feat"]["NounType"].fscore == 1.0
assert results["morph_per_feat"]["NounType"]["f"] == 1.0
# Gold annotation is modified
scorer = Scorer()
@ -282,9 +282,9 @@ def test_tag_score(tagged_doc):
assert results["tag_acc"] == 0.9
assert results["pos_acc"] == 0.9
assert results["morph_acc"] == approx(0.8)
assert results["morph_per_feat"]["NounType"].fscore == 1.0
assert results["morph_per_feat"]["Poss"].fscore == 0.0
assert results["morph_per_feat"]["Number"].fscore == approx(0.72727272)
assert results["morph_per_feat"]["NounType"]["f"] == 1.0
assert results["morph_per_feat"]["Poss"]["f"] == 0.0
assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
def test_roc_auc_score():

View File

@ -112,16 +112,15 @@ def test_tokenizer_validate_special_case(tokenizer, text, tokens):
@pytest.mark.parametrize(
"text,tokens", [("lorem", [{"orth": "lo", "tag": "NN"}, {"orth": "rem"}])]
"text,tokens", [("lorem", [{"orth": "lo", "norm": "LO"}, {"orth": "rem"}])]
)
def test_tokenizer_add_special_case_tag(text, tokens):
vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
vocab = Vocab()
tokenizer = Tokenizer(vocab, {}, None, None, None)
tokenizer.add_special_case(text, tokens)
doc = tokenizer(text)
assert doc[0].text == tokens[0]["orth"]
assert doc[0].tag_ == tokens[0]["tag"]
assert doc[0].pos_ == "NOUN"
assert doc[0].norm_ == tokens[0]["norm"]
assert doc[1].text == tokens[1]["orth"]

View File

@ -9,7 +9,6 @@ from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
cimport cython
from typing import Dict, List, Union, Pattern, Optional, Any
import re
import warnings

View File

@ -11,7 +11,7 @@ from .span cimport Span
from .token cimport Token
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..structs cimport LexemeC, TokenC
from ..attrs cimport TAG, MORPH
from ..attrs cimport MORPH
from ..vocab cimport Vocab
from .underscore import is_writable_attr
@ -365,8 +365,6 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
doc[token_index + i]._.set(ext_attr_key, ext_attr_value)
# NB: We need to call get_string_id here because only the keys are
# "intified" (since we support "KEY": [value, value] syntax here).
elif attr_name == TAG:
doc.vocab.morphology.assign_tag(token, get_string_id(attr_value))
else:
# Set attributes on both token and lexeme to take care of token
# attribute vs. lexical attribute without having to enumerate
@ -431,8 +429,6 @@ def set_token_attrs(Token py_token, attrs):
if attr_name == "_": # Set extension attributes
for ext_attr_key, ext_attr_value in attr_value.items():
py_token._.set(ext_attr_key, ext_attr_value)
elif attr_name == TAG:
doc.vocab.morphology.assign_tag(token, attr_value)
else:
# Set attributes on both token and lexeme to take care of token
# attribute vs. lexical attribute without having to enumerate

View File

@ -1,4 +1,5 @@
from typing import Iterable, Iterator
from typing import Iterable, Iterator, Union
from pathlib import Path
import numpy
import zlib
import srsly
@ -9,6 +10,7 @@ from ..vocab import Vocab
from ..compat import copy_reg
from ..attrs import SPACY, ORTH, intify_attr
from ..errors import Errors
from ..util import ensure_path
# fmt: off
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
@ -204,6 +206,30 @@ class DocBin:
assert len(tokens.shape) == 2, tokens.shape # this should never happen
return self
def to_disk(self, path: Union[str, Path]) -> None:
    """Write the serialized DocBin to a file (typically called .spacy).

    path (str / Path): Location of the file to write.

    DOCS: https://spacy.io/api/docbin#to_disk
    """
    target = ensure_path(path)
    # Open in binary mode first, then serialize straight into the handle.
    with target.open("wb") as handle:
        handle.write(self.to_bytes())
def from_disk(self, path: Union[str, Path]) -> "DocBin":
    """Load the DocBin from a file (typically called .spacy).

    path (str / Path): The file path.
    RETURNS (DocBin): The loaded DocBin.

    DOCS: https://spacy.io/api/docbin#from_disk
    """
    path = ensure_path(path)
    with path.open("rb") as file_:
        # Feed the raw file contents to from_bytes; its return value is not
        # used here, the method hands back this same instance instead.
        self.from_bytes(file_.read())
    return self
def merge_bins(bins):
merged = None

View File

@ -832,13 +832,6 @@ cdef class Doc:
rel_head_index=abs_head_index-i
)
)
# Do TAG first. This lets subsequent loop override stuff like POS, LEMMA
if TAG in attrs:
col = attrs.index(TAG)
for i in range(length):
value = values[col * stride + i]
if value != 0:
self.vocab.morphology.assign_tag(&tokens[i], value)
# Verify ENT_IOB are proper integers
if ENT_IOB in attrs:
iob_strings = Token.iob_strings()
@ -857,12 +850,11 @@ cdef class Doc:
for i in range(length):
token = &self.c[i]
for j in range(n_attrs):
if attr_ids[j] != TAG:
value = values[j * stride + i]
if attr_ids[j] == MORPH:
# add morph to morphology table
self.vocab.morphology.add(self.vocab.strings[value])
Token.set_struct_attr(token, attr_ids[j], value)
value = values[j * stride + i]
if attr_ids[j] == MORPH:
# add morph to morphology table
self.vocab.morphology.add(self.vocab.strings[value])
Token.set_struct_attr(token, attr_ids[j], value)
# Set flags
self.is_parsed = bool(self.is_parsed or HEAD in attrs)
self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)

View File

@ -332,11 +332,7 @@ cdef class Token:
inflectional suffixes.
"""
def __get__(self):
if self.c.lemma == 0:
lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
return self.vocab.strings[lemma_]
else:
return self.c.lemma
return self.c.lemma
def __set__(self, attr_t lemma):
self.c.lemma = lemma
@ -355,7 +351,7 @@ cdef class Token:
return self.c.tag
def __set__(self, attr_t tag):
self.vocab.morphology.assign_tag(self.c, tag)
self.c.tag = tag
property dep:
"""RETURNS (uint64): ID of syntactic dependency label."""
@ -888,10 +884,7 @@ cdef class Token:
with no inflectional suffixes.
"""
def __get__(self):
if self.c.lemma == 0:
return self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
else:
return self.vocab.strings[self.c.lemma]
return self.vocab.strings[self.c.lemma]
def __set__(self, unicode lemma_):
self.c.lemma = self.vocab.strings.add(lemma_)

View File

@ -9,11 +9,10 @@ from .lexeme cimport EMPTY_LEXEME, OOV_RANK
from .lexeme cimport Lexeme
from .typedefs cimport attr_t
from .tokens.token cimport Token
from .attrs cimport LANG, ORTH, TAG, POS
from .attrs cimport LANG, ORTH
from .compat import copy_reg
from .errors import Errors
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs, NORM, IS_STOP
from .vectors import Vectors
from .util import registry
@ -23,7 +22,7 @@ from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=True):
def create_vocab(lang, defaults, vectors_name=None, load_data=True):
# If the spacy-lookups-data package is installed, we pre-populate the lookups
# with lexeme data, if available
if load_data:
@ -43,7 +42,6 @@ def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=T
)
return Vocab(
lex_attr_getters=lex_attrs,
lemmatizer=lemmatizer,
lookups=lookups,
writing_system=defaults.writing_system,
get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
@ -58,17 +56,13 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab
"""
def __init__(self, lex_attr_getters=None, lemmatizer=None,
strings=tuple(), lookups=None, tag_map={},
def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
oov_prob=-20., vectors_name=None, writing_system={},
get_noun_chunks=None, **deprecated_kwargs):
"""Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to
functions to compute them. Defaults to `None`.
tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained
parts-of-speech, and optionally morphological attributes.
lemmatizer (object): A lemmatizer. Defaults to `None`.
strings (StringStore): StringStore that maps strings to integers, and
vice versa.
lookups (Lookups): Container for large lookup tables and dictionaries.
@ -78,8 +72,6 @@ cdef class Vocab:
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
if lookups in (None, True, False):
lookups = Lookups()
if lemmatizer in (None, True, False):
lemmatizer = Lemmatizer(lookups)
self.cfg = {'oov_prob': oov_prob}
self.mem = Pool()
self._by_orth = PreshMap()
@ -89,7 +81,7 @@ cdef class Vocab:
for string in strings:
_ = self[string]
self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.morphology = Morphology(self.strings)
self.vectors = Vectors(name=vectors_name)
self.lookups = lookups
self.writing_system = writing_system
@ -268,12 +260,6 @@ cdef class Vocab:
# Set the special tokens up to have arbitrary attributes
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
token.lex = lex
if TAG in props:
self.morphology.assign_tag(token, props[TAG])
elif POS in props:
# Don't allow POS to be set without TAG -- this causes problems,
# see #1773
props.pop(POS)
for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value)
# NORM is the only one that overlaps between the two

View File

@ -15,37 +15,194 @@ TODO: intro and how architectures work, link to
[`registry`](/api/top-level#registry),
[custom models](/usage/training#custom-models) usage etc.
## Tok2Vec architectures {#tok2vec source="spacy/ml/models/tok2vec.py"}
## Tok2Vec architectures {#tok2vec-arch source="spacy/ml/models/tok2vec.py"}
### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
<!-- TODO: intro -->
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.HashEmbedCNN.v1"
> # TODO: ...
>
> [model.tok2vec]
> # ...
> pretrained_vectors = null
> width = 96
> depth = 4
> embed_size = 2000
> window_size = 1
> maxout_pieces = 3
> subword_features = true
> ```
| Name | Type | Description |
| -------------------- | ----- | ----------- |
| `width` | int | |
| `depth` | int | |
| `embed_size` | int | |
| `window_size` | int | |
| `maxout_pieces` | int | |
| `subword_features` | bool | |
| `dropout` | float | |
| `pretrained_vectors` | bool | |
Build spaCy's 'standard' tok2vec layer, which uses hash embedding with subword
features and a CNN with layer-normalized maxout.
### spacy.HashCharEmbedCNN.v1 {#HashCharEmbedCNN}
| Name | Type | Description |
| -------------------- | ---- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | int | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. |
| `depth` | int | The number of convolutional layers to use. Recommended values are between `2` and `8`. |
| `embed_size` | int | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. |
| `window_size` | int | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. |
| `maxout_pieces` | int | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. |
| `subword_features` | bool | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. |
| `pretrained_vectors` | bool | Whether to also use static vectors. |
### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM}
### spacy.Tok2Vec.v1 {#Tok2Vec}
<!-- TODO: example config -->
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.Tok2Vec.v1"
>
> [model.embed]
>
> [model.encode]
> ```
Construct a tok2vec model out of embedding and encoding subnetworks. See the
["Embed, Encode, Attend, Predict"](https://explosion.ai/blog/deep-learning-formula-nlp)
blog post for background.
| Name | Type | Description |
| -------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `embed` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Embed tokens into context-independent word vector representations. |
| `encode` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Floats2d]`. **Output:** `List[Floats2d]`. Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. |
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
<!-- TODO: check example config -->
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.MultiHashEmbed.v1"
> width = 64
> rows = 2000
> also_embed_subwords = false
> also_use_static_vectors = false
> ```
Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it through
a feed-forward subnetwork to build a mixed representation. The features used
are the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying
definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from
pretrained static vectors can also be incorporated into the concatenated
representation.
| Name | Type | Description |
| ------------------------- | ---- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | int | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. |
| `rows` | int | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. |
| `also_embed_subwords` | bool | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. |
| `also_use_static_vectors` | bool | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. |
### spacy.CharacterEmbed.v1 {#CharacterEmbed}
<!-- TODO: check example config -->
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.CharacterEmbed.v1"
> width = 64
> rows = 2000
> nM = 16
> nC = 4
> ```
Construct an embedded representation based on character embeddings, using a
feed-forward network. A fixed number of UTF-8 byte characters are used for each
word, taken from the beginning and end of the word equally. Padding is used in
the center for words that are too short.
For instance, let's say `nC=4`, and the word is "jumping". The characters used
will be `"jung"` (two from the start, two from the end). If we had `nC=8`, the
characters would be `"jumpping"`: 4 from the start, 4 from the end. This ensures
that the final character is always in the last position, instead of being in an
arbitrary position depending on the word length.
The characters are embedded in an embedding table with 256 rows, and the vectors
concatenated. A hash-embedded vector of the `NORM` of the word is also
concatenated on, and the result is then passed through a feed-forward network to
construct a single vector to represent the information.
| Name | Type | Description |
| ------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | int | The width of the output vector and the `NORM` hash embedding. |
| `rows` | int | The number of rows in the `NORM` hash embedding table. |
| `nM` | int | The dimensionality of the character embeddings. Recommended values are between `16` and `64`. |
| `nC` | int | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. |
### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.MaxoutWindowEncoder.v1"
> width = 64
> window_size = 1
> maxout_pieces = 2
> depth = 4
> ```
Encode context using convolutions with maxout activation, layer normalization
and residual connections.
| Name | Type | Description |
| --------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. |
| `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. |
| `maxout_pieces` | int | The number of maxout pieces to use. Recommended values are `2` or `3`. |
| `depth` | int | The number of convolutional layers. Recommended value is `4`. |
### spacy.MishWindowEncoder.v1 {#MishWindowEncoder}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.MishWindowEncoder.v1"
> width = 64
> window_size = 1
> depth = 4
> ```
Encode context using convolutions with
[`Mish`](https://thinc.ai/docs/api-layers#mish) activation, layer normalization
and residual connections.
| Name | Type | Description |
| ------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. |
| `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. |
| `depth` | int | The number of convolutional layers. Recommended value is `4`. |
### spacy.TorchBiLSTMEncoder.v1 {#TorchBiLSTMEncoder}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.TorchBiLSTMEncoder.v1"
> width = 64
> window_size = 1
> depth = 4
> ```
Encode context using bidirectional LSTM layers. Requires
[PyTorch](https://pytorch.org).
| Name | Type | Description |
| ------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. |
| `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. |
| `depth` | int | The number of recurrent layers. Recommended value is `4`. |
## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
@ -98,9 +255,9 @@ architectures into your training config.
| `grad_factor` | float | Factor for weighting the gradient if multiple components listen to the same transformer model. |
| `pooling` | `Model[Ragged, Floats2d]` | Pooling layer to determine how the vector for each spaCy token will be computed. |
## Parser & NER architectures {#parser source="spacy/ml/models/parser.py"}
## Parser & NER architectures {#parser}
### spacy.TransitionBasedParser.v1 {#TransitionBasedParser}
### spacy.TransitionBasedParser.v1 {#TransitionBasedParser source="spacy/ml/models/parser.py"}
> #### Example Config
>
@ -112,24 +269,100 @@ architectures into your training config.
> maxout_pieces = 2
>
> [model.tok2vec]
> # ...
> @architectures = "spacy.HashEmbedCNN.v1"
> pretrained_vectors = null
> width = 96
> depth = 4
> embed_size = 2000
> window_size = 1
> maxout_pieces = 3
> subword_features = true
> ```
| Name | Type | Description |
| ------------------- | ------------------------------------------ | ----------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
| `nr_feature_tokens` | int | |
| `hidden_width` | int | |
| `maxout_pieces` | int | |
| `use_upper` | bool | |
| `nO` | int | |
Build a transition-based parser model. Can apply to NER or dependency-parsing.
Transition-based parsing is an approach to structured prediction where the task
of predicting the structure is mapped to a series of state transitions. You
might find [this tutorial](https://explosion.ai/blog/parsing-english-in-python)
helpful for background information. The neural network state prediction model
consists of either two or three subnetworks:
- **tok2vec**: Map each token into a vector representation. This subnetwork is
run once for each batch.
- **lower**: Construct a feature-specific vector for each `(token, feature)`
pair. This is also run once for each batch. Constructing the state
representation is then simply a matter of summing the component features and
applying the non-linearity.
- **upper** (optional): A feed-forward network that predicts scores from the
state representation. If not present, the output from the lower model is used
as action scores directly.
| Name | Type | Description |
| ------------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
| `nr_feature_tokens` | int | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. |
| `hidden_width` | int | The width of the hidden layer. |
| `maxout_pieces` | int | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. |
| `use_upper` | bool | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. |
| `nO` | int | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. |
### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.BILUOTagger.v1"
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> # etc.
> ```
Construct a simple NER tagger that predicts
[BILUO](/usage/linguistic-features#accessing-ner) tag scores for each token and
uses greedy decoding with transition-constraints to return a valid BILUO tag
sequence. A BILUO tag sequence encodes a sequence of non-overlapping labelled
spans into tags assigned to each token. The first token of a span is given the
tag `B-LABEL`, the last token of the span is given the tag `L-LABEL`, and tokens
within the span are given the tag `I-LABEL`. Single-token spans are given the
tag `U-LABEL`. All other tokens are assigned the tag `O`. The BILUO tag scheme
generally results in better linear separation between classes, especially for
non-CRF models, because there are more distinct classes for the different
situations ([Ratinov et al., 2009](https://www.aclweb.org/anthology/W09-1119/)).
| Name | Type | Description |
| --------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
### spacy.IOBTagger.v1 {#IOBTagger source="spacy/ml/models/simple_ner.py"}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.IOBTagger.v1"
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> # etc.
> ```
Construct a simple NER tagger that predicts
[IOB](/usage/linguistic-features#accessing-ner) tag scores for each token and
uses greedy decoding with transition-constraints to return a valid IOB tag
sequence. An IOB tag sequence encodes a sequence of non-overlapping labeled
spans into tags assigned to each token. The first token of a span is given the
tag `B-LABEL`, and subsequent tokens are given the tag `I-LABEL`. All other
tokens are assigned the tag `O`.
| Name | Type | Description |
| --------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
### spacy.Tagger.v1 {#Tagger}
<!-- TODO: intro -->
> #### Example Config
>
> ```ini
@ -141,26 +374,143 @@ architectures into your training config.
> # ...
> ```
| Name | Type | Description |
| --------- | ------------------------------------------ | ----------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
| `nO` | int | |
Build a tagger model, using a provided token-to-vector component. The tagger
model simply adds a linear layer with softmax activation to predict scores given
the token vectors.
| Name | Type | Description |
| --------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
| `nO` | int | The number of tags to output. Inferred from the data if `None`. |
## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"}
A text classification architecture needs to take a [`Doc`](/api/doc) as input,
and produce a score for each potential label class. Textcat challenges can be
binary (e.g. sentiment analysis) or involve multiple possible labels.
Multi-label challenges can either have mutually exclusive labels (each example
has exactly one label), or multiple labels may be applicable at the same time.
As the properties of text classification problems can vary widely, we provide
several different built-in architectures. It is recommended to experiment with
different architectures and settings to determine what works best on your
specific data and challenge.
### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
Stacked ensemble of a bag-of-words model and a neural network model. The neural
network has an internal CNN Tok2Vec layer and uses attention.
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TextCatEnsemble.v1"
> exclusive_classes = false
> pretrained_vectors = null
> width = 64
> embed_size = 2000
> conv_depth = 2
> window_size = 1
> ngram_size = 1
> dropout = null
> nO = null
> ```
| Name | Type | Description |
| -------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
| `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. |
| `width` | int | Output dimension of the feature encoding step. |
| `embed_size` | int | Input dimension of the feature encoding step. |
| `conv_depth` | int | Depth of the Tok2Vec layer. |
| `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. |
| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. |
| `dropout` | float | The dropout rate. |
| `nO` | int | Output dimension, determined by the number of different labels. |
If the `nO` dimension is not set, the TextCategorizer component will set it when
`begin_training` is called.
### spacy.TextCatCNN.v1 {#TextCatCNN}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TextCatCNN.v1"
> exclusive_classes = false
> nO = null
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> pretrained_vectors = null
> width = 96
> depth = 4
> embed_size = 2000
> window_size = 1
> maxout_pieces = 3
> subword_features = true
> ```
A neural network model where token vectors are calculated using a CNN. The
vectors are mean pooled and used as features in a feed-forward network. This
architecture is usually less accurate than the ensemble, but runs faster.
| Name | Type | Description |
| ------------------- | ------------------------------------------ | --------------------------------------------------------------- |
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
| `nO` | int | Output dimension, determined by the number of different labels. |
If the `nO` dimension is not set, the TextCategorizer component will set it when
`begin_training` is called.
### spacy.TextCatBOW.v1 {#TextCatBOW}
### spacy.TextCatCNN.v1 {#TextCatCNN}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TextCatBOW.v1"
> exclusive_classes = false
> ngram_size = 1
> no_output_layer = false
> nO = null
> ```
An ngram "bag-of-words" model. This architecture should run much faster than the
others, but may not be as accurate, especially if texts are short.
| Name | Type | Description |
| ------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. |
| `no_output_layer` | bool | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`). |
| `nO` | int | Output dimension, determined by the number of different labels. |
If the `nO` dimension is not set, the TextCategorizer component will set it when
`begin_training` is called.
### spacy.TextCatLowData.v1 {#TextCatLowData}
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
An [`EntityLinker`](/api/entitylinker) component disambiguates textual mentions
(tagged as named entities) to unique identifiers, grounding the named entities
into the "real world". This requires 3 main components:
- A [`KnowledgeBase`](/api/kb) (KB) holding the unique identifiers, potential
synonyms and prior probabilities.
- A candidate generation step to produce a set of likely identifiers, given a
certain textual mention.
- A machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the
most plausible ID from the set of candidates.
### spacy.EntityLinker.v1 {#EntityLinker}
<!-- TODO: intro -->
The `EntityLinker` model architecture is a `Thinc` `Model` with a Linear output
layer.
> #### Example Config
>
@ -170,10 +520,46 @@ architectures into your training config.
> nO = null
>
> [model.tok2vec]
> # ...
> @architectures = "spacy.HashEmbedCNN.v1"
> pretrained_vectors = null
> width = 96
> depth = 2
> embed_size = 300
> window_size = 1
> maxout_pieces = 3
> subword_features = true
>
> [kb_loader]
> @assets = "spacy.EmptyKB.v1"
> entity_vector_length = 64
>
> [get_candidates]
> @assets = "spacy.CandidateGenerator.v1"
> ```
| Name | Type | Description |
| --------- | ------------------------------------------ | ----------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
| `nO` | int | |
| Name | Type | Description |
| --------- | ------------------------------------------ | ---------------------------------------------------------------------------------------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
| `nO` | int | Output dimension, determined by the length of the vectors encoding each entity in the KB. |
If the `nO` dimension is not set, the Entity Linking component will set it when
`begin_training` is called.
### spacy.EmptyKB.v1 {#EmptyKB}
A function that creates a default, empty `KnowledgeBase` from a
[`Vocab`](/api/vocab) instance.
| Name | Type | Description |
| ---------------------- | ---- | ------------------------------------------------------------------------- |
| `entity_vector_length` | int | The length of the vectors encoding each entity in the KB - 64 by default. |
### spacy.CandidateGenerator.v1 {#CandidateGenerator}
A function that takes as input a [`KnowledgeBase`](/api/kb) and a
[`Span`](/api/span) object denoting a named entity, and returns a list of
plausible [`Candidate` objects](/api/kb/#candidate_init).
The default `CandidateGenerator` simply uses the text of a mention to find its
potential aliases in the `KnowledgeBase`. Note that this function is
case-dependent.

View File

@ -0,0 +1,245 @@
---
title: AttributeRuler
tag: class
source: spacy/pipeline/attributeruler.py
new: 3
teaser: 'Pipeline component for rule-based token attribute assignment'
api_string_name: attribute_ruler
api_trainable: false
---
The attribute ruler lets you set token attributes for tokens identified by
[`Matcher` patterns](/usage/rule-based-matching#matcher). The attribute ruler is
typically used to handle exceptions for token attributes and to map values
between attributes such as mapping fine-grained POS tags to coarse-grained POS
tags.
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
[`config.cfg` for training](/usage/training#config).
> #### Example
>
> ```python
> config = {
> "pattern_dicts": None,
> "validate": True,
> }
> nlp.add_pipe("attribute_ruler", config=config)
> ```
| Setting | Type | Description | Default |
| --------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| `pattern_dicts` | `Iterable[dict]` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](#add) (`patterns`/`attrs`/`index`) to add as patterns. | `None` |
| `validate` | bool | Whether patterns should be validated (passed to the `Matcher`). | `False` |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py
```
## AttributeRuler.\_\_init\_\_ {#init tag="method"}
Initialize the attribute ruler. If pattern dicts are supplied here, they need to
be a list of dictionaries with `"patterns"`, `"attrs"`, and optional `"index"`
keys, e.g.:
```python
pattern_dicts = [
    {"patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}},
    {"patterns": [[{"LOWER": "an"}]], "attrs": {"LEMMA": "a"}},
]
```
> #### Example
>
> ```python
> # Construction via add_pipe
> attribute_ruler = nlp.add_pipe("attribute_ruler")
> ```
| Name | Type | Description |
| --------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary to pass to the matcher. |
| `name` | str | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. |
| _keyword-only_ | | |
| `pattern_dicts` | `Iterable[Dict]` | Optional patterns to load in on initialization. Defaults to `None`. |
| `validate` | bool | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. |
## AttributeRuler.\_\_call\_\_ {#call tag="method"}
Apply the attribute ruler to a Doc, setting token attributes for tokens matched
by the provided patterns.
| Name | Type | Description |
| ----------- | ----- | ------------------------------------------------------------ |
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
| **RETURNS** | `Doc` | The modified `Doc` with token attributes set for any matched patterns. |
## AttributeRuler.add {#add tag="method"}
Add patterns to the attribute ruler. The patterns are a list of `Matcher`
patterns and the attributes are a dict of attributes to set on the matched
token. If the pattern matches a span of more than one token, the `index` can be
used to set the attributes for the token at that index in the span. The `index`
may be negative to index from the end of the span.
> #### Example
>
> ```python
> attribute_ruler = nlp.add_pipe("attribute_ruler")
> patterns = [[{"TAG": "VB"}]]
> attrs = {"POS": "VERB"}
> attribute_ruler.add(patterns=patterns, attrs=attrs)
> ```
| Name | Type | Description |
| -------- | ---------------------- | ----------------------------------------------------------------------------------------------------------------------- |
| patterns | `Iterable[List[Dict]]` | A list of Matcher patterns. |
| attrs | dict | The attributes to assign to the target token in the matched span. |
| index | int | The index of the token in the matched span to modify. May be negative to index from the end of the span. Defaults to 0. |
## AttributeRuler.add_patterns {#add_patterns tag="method"}
> #### Example
>
> ```python
> attribute_ruler = nlp.add_pipe("attribute_ruler")
> pattern_dicts = [
>     {
>         "patterns": [[{"TAG": "VB"}]],
>         "attrs": {"POS": "VERB"}
>     },
>     {
>         "patterns": [[{"LOWER": "two"}, {"LOWER": "apples"}]],
>         "attrs": {"LEMMA": "apple"},
>         "index": -1
>     },
> ]
> attribute_ruler.add_patterns(pattern_dicts)
> ```
Add patterns from a list of pattern dicts with the keys as the arguments to
[`AttributeRuler.add`](#add).
| Name | Type | Description |
| --------------- | ----------------- | -------------------- |
| `pattern_dicts` | `Iterable[Dict]` | The patterns to add. |
## AttributeRuler.patterns {#patterns tag="property"}
Get all patterns that have been added to the attribute ruler in the
`pattern_dicts` format accepted by
[`AttributeRuler.add_patterns`](#add_patterns).
| Name | Type | Description |
| ----------- | ------------ | ------------------------------------------ |
| **RETURNS** | `List[dict]` | The patterns added to the attribute ruler. |
## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"}
Load attribute ruler patterns from a tag map.
| Name | Type | Description |
| --------- | ---- | ------------------------------------------------------------------------------------------ |
| `tag_map` | dict | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. |
## AttributeRuler.load_from_morph_rules {#load_from_morph_rules tag="method"}
Load attribute ruler patterns from morph rules.
| Name | Type | Description |
| ------------- | ---- | -------------------------------------------------------------------------------------------------------------------- |
| `morph_rules` | dict | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. |
## AttributeRuler.to_disk {#to_disk tag="method"}
Serialize the pipe to disk.
> #### Example
>
> ```python
> attribute_ruler = nlp.add_pipe("attribute_ruler")
> attribute_ruler.to_disk("/path/to/attribute_ruler")
> ```
| Name | Type | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| _keyword-only_ | | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
## AttributeRuler.from_disk {#from_disk tag="method"}
Load the pipe from disk. Modifies the object in place and returns it.
> #### Example
>
> ```python
> attribute_ruler = nlp.add_pipe("attribute_ruler")
> attribute_ruler.from_disk("/path/to/attribute_ruler")
> ```
| Name | Type | Description |
| -------------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| _keyword-only_ | | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `AttributeRuler` | The modified `AttributeRuler` object. |
## AttributeRuler.to_bytes {#to_bytes tag="method"}
> #### Example
>
> ```python
> attribute_ruler = nlp.add_pipe("attribute_ruler")
> attribute_ruler_bytes = attribute_ruler.to_bytes()
> ```
Serialize the pipe to a bytestring.
| Name | Type | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- |
| _keyword-only_ | | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `AttributeRuler` object. |
## AttributeRuler.from_bytes {#from_bytes tag="method"}
Load the pipe from a bytestring. Modifies the object in place and returns it.
> #### Example
>
> ```python
> attribute_ruler_bytes = attribute_ruler.to_bytes()
> attribute_ruler = nlp.add_pipe("attribute_ruler")
> attribute_ruler.from_bytes(attribute_ruler_bytes)
> ```
| Name | Type | Description |
| -------------- | ---------------- | ------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. |
| _keyword-only_ | | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `AttributeRuler` | The `AttributeRuler` object. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> data = attribute_ruler.to_disk("/path", exclude=["vocab"])
> ```
| Name | Description |
| ---------- | -------------------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `patterns` | The Matcher patterns. You usually don't want to exclude this. |
| `attrs` | The attributes to set. You usually don't want to exclude this. |
| `indices` | The token indices. You usually don't want to exclude this. |

View File

@ -132,7 +132,7 @@ $ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline]
| `--base`, `-b` | option | Optional base config file to auto-fill with defaults. |
| `--lang`, `-l` | option | Optional language code to use for blank config. If a `--pipeline` is specified, the components will be added in order. |
| `--model`, `-m` | option | Optional base model to copy config from. If a `--pipeline` is specified, only those components will be kept, and all other components not in the model will be added. |
| `--pipeline`, `-p` | option | Optional comma-separate pipeline of components to add to blank language or model. |
| `--pipeline`, `-p` | option | Optional comma-separated pipeline of components to add to blank language or model. |
| **CREATES** | config | Complete and auto-filled config file for training. |
### init model {#init-model new="2"}
@ -202,7 +202,7 @@ $ python -m spacy convert [input_file] [output_dir] [--converter]
| ID | Description |
| ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `auto` | Automatically pick converter based on file extension and file content (default). |
| `json` | JSON-formatted training data used in spaCy v2.x and produced by [`docs2json`](/api/top-level#docs_to_json). |
| `json` | JSON-formatted training data used in spaCy v2.x. |
| `conll` | Universal Dependencies `.conllu` or `.conll` format. |
| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
@ -219,23 +219,22 @@ The command will create all objects in the tree and validate them. Note that
some config validation errors are blocking and will prevent the rest of the
config from being resolved. This means that you may not see all validation
errors at once and some issues are only shown once previous errors have been
fixed.
Instead of specifying all required settings in the config file, you can rely on
an auto-fill functionality that uses spaCy's built-in defaults. The resulting
full config can be written to file and used in downstream training tasks.
fixed. To auto-fill a partial config and save the result, you can use the
[`init config`](/api/cli#init-config) command.
```bash
$ python -m spacy debug config [config_path] [--code_path] [--output] [--auto_fill] [--diff] [overrides]
```
> #### Example 1
> #### Example
>
> ```bash
> $ python -m spacy debug config ./config.cfg
> ```
<Accordion title="Example 1 output" spaced>
<Accordion title="Example output" spaced>
<!-- TODO: update examples with validation error of final config -->
```
✘ Config validation error
@ -254,30 +253,15 @@ training -> width extra fields not permitted
</Accordion>
> #### Example 2
>
> ```bash
> $ python -m spacy debug config ./minimal_config.cfg -F -o ./filled_config.cfg
> ```
<Accordion title="Example 2 output" spaced>
```
✔ Auto-filled config is valid
✔ Saved updated config to ./filled_config.cfg
```
</Accordion>
| Argument | Type | Default | Description |
| --------------------- | ---------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `config_path` | positional | - | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--code_path`, `-c` | option | `None` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--auto_fill`, `-F` | option | `False` | Whether or not to auto-fill the config with built-in defaults if possible. If `False`, the provided config needs to be complete. |
| `--output_path`, `-o` | option | `None` | Output path where the filled config can be stored. Use '-' for standard output. |
| `--diff`, `-D` | option | `False` | Show a visual diff if config was auto-filled. |
| `--help`, `-h` | flag | `False` | Show help message and available arguments. |
| overrides | | `None` | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
| Argument | Type | Default | Description |
| --------------------- | ---------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `config_path` | positional | - | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--code_path`, `-c` | option | `None` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--auto_fill`, `-F` | option | `False` | Whether or not to auto-fill the config with built-in defaults if possible. If `False`, the provided config needs to be complete. |
| `--output_path`, `-o` | option | `None` | Output path where the filled config can be stored. Use '-' for standard output. |
| `--diff`, `-D` | option | `False` | Show a visual diff if config was auto-filled. |
| `--help`, `-h` | flag | `False` | Show help message and available arguments. |
| overrides | | `None` | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
### debug data {#debug-data}
@ -287,21 +271,22 @@ low data labels and more.
<Infobox title="New in v3.0" variant="warning">
The `debug-data` command is now available as a subcommand of `spacy debug`. It
The `debug data` command is now available as a subcommand of `spacy debug`. It
takes the same arguments as `train` and reads settings off the
[`config.cfg` file](/usage/training#config).
[`config.cfg` file](/usage/training#config) and optional
[overrides](/usage/training#config-overrides) on the CLI.
</Infobox>
```bash
$ python -m spacy debug data [train_path] [dev_path] [config_path] [--code]
[--ignore-warnings] [--verbose] [--no-format] [overrides]
$ python -m spacy debug data [config_path] [--code] [--ignore-warnings]
[--verbose] [--no-format] [overrides]
```
> #### Example
>
> ```bash
> $ python -m spacy debug data ./train.spacy ./dev.spacy ./config.cfg
> $ python -m spacy debug data ./config.cfg
> ```
<Accordion title="Example output" spaced>
@ -443,17 +428,15 @@ will not be available.
</Accordion>
| Argument | Type | Description |
| -------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `train_path` | positional | Location of [binary training data](/usage/training#data-format). Can be a file or a directory of files. |
| `dev_path` | positional | Location of [binary development data](/usage/training#data-format) for evaluation. Can be a file or a directory of files. |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
| `--verbose`, `-V` | flag | Print additional information and explanations. |
| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
| Argument | Type | Description |
| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
| `--verbose`, `-V` | flag | Print additional information and explanations. |
| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
<!-- TODO: document debug profile?-->
@ -463,16 +446,20 @@ Debug a Thinc [`Model`](https://thinc.ai/docs/api-model) by running it on a
sample text and checking how it updates its internal weights and parameters.
```bash
$ python -m spacy debug model [config_path] [component] [--layers] [-DIM] [-PAR] [-GRAD] [-ATTR] [-P0] [-P1] [-P2] [P3] [--gpu_id]
$ python -m spacy debug model [config_path] [component] [--layers] [-DIM]
[-PAR] [-GRAD] [-ATTR] [-P0] [-P1] [-P2] [-P3] [--gpu-id]
```
> #### Example 1
>
> ```bash
> $ python -m spacy debug model ./config.cfg tagger -P0
> ```
<Accordion title="Example outputs" spaced>
<Accordion title="Example 1 output" spaced>
In this example log, we just print the name of each layer after creation of the
model ("Step 0"), which helps us to understand the internal structure of the
Neural Network, and to focus on specific layers that we want to inspect further
(see next example).
```bash
$ python -m spacy debug model ./config.cfg tagger -P0
```
```
Using CPU
@ -509,20 +496,16 @@ $ python -m spacy debug model [config_path] [component] [--layers] [-DIM] [-PAR]
...
```
</Accordion>
In this example log, we see how initialization of the model (Step 1) propagates
the correct values for the `nI` (input) and `nO` (output) dimensions of the
various layers. In the `softmax` layer, this step also defines the `W` matrix as
an all-zero matrix determined by the `nO` and `nI` dimensions. After a first
training step (Step 2), this matrix has clearly updated its values through the
training feedback loop.
In this example log, we just print the name of each layer after creation of the
model ("Step 0"), which helps us to understand the internal structure of the
Neural Network, and to focus on specific layers that we want to inspect further
(see next example).
> #### Example 2
>
> ```bash
> $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P2
> ```
<Accordion title="Example 2 output" spaced>
```bash
$ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P2
```
```
Using CPU
@ -563,27 +546,20 @@ Neural Network, and to focus on specific layers that we want to inspect further
</Accordion>
In this example log, we see how initialization of the model (Step 1) propagates
the correct values for the `nI` (input) and `nO` (output) dimensions of the
various layers. In the `softmax` layer, this step also defines the `W` matrix as
an all-zero matrix determined by the `nO` and `nI` dimensions. After a first
training step (Step 2), this matrix has clearly updated its values through the
training feedback loop.
| Argument | Type | Default | Description |
| ----------------------- | ---------- | ------- | ---------------------------------------------------------------------------------------------------- |
| Argument | Type | Default | Description |
| ----------------------- | ---------- | ------- | ----------------------------------------------------------------------------------------------------- |
| `config_path` | positional | | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `component` | positional | | Name of the pipeline component of which the model should be analysed. |
| `--layers`, `-l` | option | | Comma-separated names of layer IDs to print. |
| `--dimensions`, `-DIM` | option | `False` | Show dimensions of each layer. |
| `--parameters`, `-PAR` | option | `False` | Show parameters of each layer. |
| `--gradients`, `-GRAD` | option | `False` | Show gradients of each layer. |
| `--attributes`, `-ATTR` | option | `False` | Show attributes of each layer. |
| `--print-step0`, `-P0` | option | `False` | Print model before training. |
| `--print-step1`, `-P1` | option | `False` | Print model after initialization. |
| `--print-step2`, `-P2` | option | `False` | Print model after training. |
| `--print-step3`, `-P3` | option | `False` | Print final predictions. |
| `--help`, `-h` | flag | | Show help message and available arguments. |
| `component` | positional | | Name of the pipeline component of which the model should be analyzed. |
| `--layers`, `-l` | option | | Comma-separated names of layer IDs to print. |
| `--dimensions`, `-DIM` | option | `False` | Show dimensions of each layer. |
| `--parameters`, `-PAR` | option | `False` | Show parameters of each layer. |
| `--gradients`, `-GRAD` | option | `False` | Show gradients of each layer. |
| `--attributes`, `-ATTR` | option | `False` | Show attributes of each layer. |
| `--print-step0`, `-P0` | option | `False` | Print model before training. |
| `--print-step1`, `-P1` | option | `False` | Print model after initialization. |
| `--print-step2`, `-P2` | option | `False` | Print model after training. |
| `--print-step3`, `-P3` | option | `False` | Print final predictions. |
| `--help`, `-h` | flag | | Show help message and available arguments. |
## Train {#train}
@ -603,37 +579,39 @@ you need to manage complex multi-step training workflows, check out the new
The `train` command doesn't take a long list of command-line arguments anymore
and instead expects a single [`config.cfg` file](/usage/training#config)
containing all settings for the pipeline, training process and hyperparameters.
Config values can be [overwritten](/usage/training#config-overrides) on the CLI
if needed. For example, `--paths.train ./train.spacy` sets the variable `train`
in the section `[paths]`.
</Infobox>
```bash
$ python -m spacy train [train_path] [dev_path] [config_path] [--output]
[--code] [--verbose] [overrides]
$ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides]
```
| Argument | Type | Description |
| ----------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `train_path` | positional | Location of training data in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. |
| `dev_path` | positional | Location of development data for evaluation in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--output`, `-o` | positional | Directory to store model in. Will be created if it doesn't exist. |
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--verbose`, `-V` | flag | Show more detailed messages during training. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
| **CREATES** | model | The final model and the best model. |
| Argument | Type | Description |
| ----------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--output`, `-o` | option | Directory to store model in. Will be created if it doesn't exist. |
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--verbose`, `-V` | flag | Show more detailed messages during training. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
| **CREATES** | model | The final model and the best model. |
## Pretrain {#pretrain new="2.1" tag="experimental"}
<!-- TODO: document new pretrain command and link to new pretraining docs -->
Pre-train the "token to vector" (`tok2vec`) layer of pipeline components, using
an approximate language-modeling objective. Specifically, we load pretrained
vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which
match the pretrained ones. The weights are saved to a directory after each
epoch. You can then pass a path to one of these pretrained weights files to the
`spacy train` command. This technique may be especially helpful if you have
little labelled data.
Pre-train the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
components on [raw text](/api/data-formats#pretrain), using an approximate
language-modeling objective. Specifically, we load pretrained vectors, and train
a component like a CNN, BiLSTM, etc to predict vectors which match the
pretrained ones. The weights are saved to a directory after each epoch. You can
then include a **path to one of these pretrained weights files** in your
[training config](/usage/training#config) as the `init_tok2vec` setting when you
train your model. This technique may be especially helpful if you have little
labelled data.
<Infobox title="Changed in v3.0" variant="warning">
@ -650,63 +628,33 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]
[--code] [--resume-path] [--epoch-resume] [overrides]
```
| Argument | Type | Description |
| ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](#pretrain-jsonl) for details. |
| `output_dir` | positional | Directory to write models to on each epoch. |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--resume-path`, `-r` | option | TODO: |
| `--epoch-resume`, `-er` | option | TODO: |
| `--help`, `-h` | flag | Show help message and available arguments. |
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
### JSONL format for raw text {#pretrain-jsonl}
Raw text can be provided as a `.jsonl` (newline-delimited JSON) file containing
one input text per line (roughly paragraph length is good). Optionally, custom
tokenization can be provided.
> #### Tip: Writing JSONL
>
> Our utility library [`srsly`](https://github.com/explosion/srsly) provides a
> handy `write_jsonl` helper that takes a file path and list of dictionaries and
> writes out JSONL-formatted data.
>
> ```python
> import srsly
> data = [{"text": "Some text"}, {"text": "More..."}]
> srsly.write_jsonl("/path/to/text.jsonl", data)
> ```
| Key | Type | Description |
| -------- | ---- | ---------------------------------------------------------- |
| `text` | str | The raw input text. Is not required if `tokens` available. |
| `tokens` | list | Optional tokenization, one string per token. |
```json
### Example
{"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
{"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
{"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]}
```
| Argument | Type | Description |
| ----------------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. |
| `output_dir` | positional | Directory to write models to on each epoch. |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--resume-path`, `-r` | option | TODO: |
| `--epoch-resume`, `-er` | option | TODO: |
| `--help`, `-h` | flag | Show help message and available arguments. |
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
## Evaluate {#evaluate new="2"}
<!-- TODO: document new evaluate command -->
Evaluate a model's accuracy and speed on JSON-formatted annotated data. Will
print the results and optionally export
[displaCy visualizations](/usage/visualizers) of a sample set of parses to
`.html` files. Visualizations for the dependency parse and NER will be exported
as separate files if the respective component is present in the model's
pipeline.
Evaluate a model. Expects a loadable spaCy model and evaluation data in the
[binary `.spacy` format](/api/data-formats#binary-training). The
`--gold-preproc` option sets up the evaluation examples with gold-standard
sentences and tokens for the predictions. Gold preprocessing helps the
annotations align to the tokenization, and may result in sequences of more
consistent length. However, it may reduce runtime accuracy due to train/test
skew. To render a sample of dependency parses in an HTML file using the
[displaCy visualizations](/usage/visualizers), set an output directory as the
`--displacy-path` argument.
```bash
$ python -m spacy evaluate [model] [data_path] [--output] [--displacy-path]
[--displacy-limit] [--gpu-id] [--gold-preproc]
$ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc]
[--gpu-id] [--displacy-path] [--displacy-limit]
```
| Argument | Type | Description |
@ -714,10 +662,10 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--displacy-path]
| `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. |
| `data_path` | positional | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). |
| `--output`, `-o` | option | Output JSON file for metrics. If not set, no metrics will be exported. |
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. |
| `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. |
| `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. |
| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. |
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
| **CREATES** | `stdout`, JSON, HTML | Training results and optional metrics and visualizations. |
## Package {#package}

View File

@ -9,7 +9,41 @@ new: 3
This class manages annotated corpora and can be used for training and
development datasets in the [DocBin](/api/docbin) (`.spacy`) format. To
customize the data loading during training, you can register your own
[data readers and batchers](/usage/training#custom-code-readers-batchers)
[data readers and batchers](/usage/training#custom-code-readers-batchers).
## Config and implementation {#config}
`spacy.Corpus.v1` is a registered function that creates a `Corpus` of training
or evaluation data. It takes the same arguments as the `Corpus` class and
returns a callable that yields [`Example`](/api/example) objects. You can
replace it with your own registered function in the
[`@readers` registry](/api/top-level#registry) to customize the data loading and
streaming.
> #### Example config
>
> ```ini
> [paths]
> train = "corpus/train.spacy"
>
> [training.train_corpus]
> @readers = "spacy.Corpus.v1"
> path = ${paths:train}
> gold_preproc = false
> max_length = 0
> limit = 0
> ```
| Name | Type | Description |
| --------------- | ------ | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | `Path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). |
|  `gold_preproc` | bool | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. |
| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. |
| `limit` | int | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/gold/corpus.py
```
## Corpus.\_\_init\_\_ {#init tag="method"}

View File

@ -2,33 +2,222 @@
title: Data formats
teaser: Details on spaCy's input and output data formats
menu:
- ['Training Data', 'training']
- ['Training Config', 'config']
- ['Training Data', 'training']
- ['Pretraining Data', 'pretraining']
- ['Vocabulary', 'vocab']
---
This section documents input and output formats of data used by spaCy, including
training data and lexical vocabulary data. For an overview of label schemes used
by the models, see the [models directory](/models). Each model documents the
label schemes used in its components, depending on the data it was trained on.
the [training config](/usage/training#config), training data and lexical
vocabulary data. For an overview of label schemes used by the models, see the
[models directory](/models). Each model documents the label schemes used in its
components, depending on the data it was trained on.
## Training config {#config new="3"}
Config files define the training process and model pipeline and can be passed to
[`spacy train`](/api/cli#train). They use
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
hood. For details on how to use training configs, see the
[usage documentation](/usage/training#config).
<!-- TODO: add details on getting started and init config -->
> #### What does the @ mean?
>
> The `@` syntax lets you refer to function names registered in the
> [function registry](/api/top-level#registry). For example,
> `@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of
> the name [spacy.HashEmbedCNN.v1](/api/architectures#HashEmbedCNN) and all
> other values defined in its block will be passed into that function as
> arguments. Those arguments depend on the registered function. See the usage
> guide on [registered functions](/usage/training#config-functions) for details.
```ini
https://github.com/explosion/spaCy/blob/develop/spacy/default_config.cfg
```
<Infobox title="Notes on data validation" emoji="💡">
Under the hood, spaCy's configs are powered by our machine learning library
[Thinc's config system](https://thinc.ai/docs/usage-config), which uses
[`pydantic`](https://github.com/samuelcolvin/pydantic/) for data validation
based on type hints. See
[`spacy/schemas.py`](https://github.com/explosion/spaCy/blob/develop/spacy/schemas.py)
for the schemas used to validate the default config. Arguments of registered
functions are validated against their type annotations, if available. To debug
your config and check that it's valid, you can run the
[`spacy debug config`](/api/cli#debug-config) command.
</Infobox>
<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
### nlp {#config-nlp tag="section"}
> #### Example
>
> ```ini
> [nlp]
> lang = "en"
> pipeline = ["tagger", "parser", "ner"]
> load_vocab_data = true
> before_creation = null
> after_creation = null
> after_pipeline_creation = null
>
> [nlp.tokenizer]
> @tokenizers = "spacy.Tokenizer.v1"
> ```
Defines the `nlp` object, its tokenizer and
[processing pipeline](/usage/processing-pipelines) component names.
| Name | Type | Description | Default |
| ------------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------- |
| `lang` | str | The language code to use. | `null` |
| `pipeline` | `List[str]` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). | `[]` |
| `load_vocab_data` | bool | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. | `true` |
| `before_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. | `null` |
| `after_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. | `null` |
| `after_pipeline_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. | `null` |
| `tokenizer` | callable | The tokenizer to use. | [`Tokenizer`](/api/tokenizer) |
### components {#config-components tag="section"}
> #### Example
>
> ```ini
> [components.textcat]
> factory = "textcat"
> labels = ["POSITIVE", "NEGATIVE"]
>
> [components.textcat.model]
> @architectures = "spacy.TextCatBOW.v1"
> exclusive_classes = false
> ngram_size = 1
> no_output_layer = false
> ```
This section includes definitions of the
[pipeline components](/usage/processing-pipelines) and their models, if
available. Components in this section can be referenced in the `pipeline` of the
`[nlp]` block. Component blocks need to specify either a `factory` (named
function to use to create component) or a `source` (name or path of pretrained
model to copy components from). See the docs on
[defining pipeline components](/usage/training#config-components) for details.
### paths, system {#config-variables tag="variables"}
These sections define variables that can be referenced across the other sections
of the config. For example, `${paths:train}` uses the value of `train` defined in
the block `[paths]`. If your config includes custom registered functions that
need paths, you can define them here. All config values can also be
[overwritten](/usage/training#config-overrides) on the CLI when you run
[`spacy train`](/api/cli#train), which is especially relevant for data paths
that you don't want to hard-code in your config file.
```bash
$ python -m spacy train ./config.cfg --paths.train ./corpus/train.spacy
```
### training {#config-training tag="section"}
This section defines settings and controls for the training and evaluation
process that are used when you run [`spacy train`](/api/cli#train).
<!-- TODO: complete -->
| Name | Type | Description | Default |
| --------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------- |
| `seed` | int | The random seed. | `${system:seed}` |
| `dropout` | float | The dropout rate. | `0.1` |
| `accumulate_gradient` | int | Whether to divide the batch up into substeps. | `1` |
| `init_tok2vec` | str | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). | `${paths:init_tok2vec}` |
| `raw_text` | str | | `${paths:raw}` |
| `vectors` | str | | `null` |
| `patience` | int | How many steps to continue without improvement in evaluation score. | `1600` |
| `max_epochs` | int | Maximum number of epochs to train for. | `0` |
| `max_steps` | int | Maximum number of update steps to train for. | `20000` |
| `eval_frequency` | int | How often to evaluate during training (steps). | `200` |
| `score_weights` | `Dict[str, float]` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. | `{}` |
| `frozen_components` | `List[str]` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. | `[]` |
| `train_corpus` | callable | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. | [`Corpus`](/api/corpus) |
| `dev_corpus` | callable | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. | [`Corpus`](/api/corpus) |
| `batcher` | callable | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. | [`batch_by_words`](/api/top-level#batch_by_words) |
| `optimizer` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) |
### pretraining {#config-pretraining tag="section,optional"}
This section is optional and defines settings and controls for
[language model pretraining](/usage/training#pretraining). It's used when you
run [`spacy pretrain`](/api/cli#pretrain).
<!-- TODO: complete -->
| Name | Type | Description | Default |
| ---------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------- | --------------------------------------------------- |
| `max_epochs` | int | Maximum number of epochs. | `1000` |
| `min_length` | int | Minimum length of examples. | `5` |
| `max_length` | int | Maximum length of examples. | `500` |
| `dropout` | float | The dropout rate. | `0.2` |
| `n_save_every` | int | Saving frequency. | `null` |
| `batch_size` | int / `Sequence[int]` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). | `3000` |
| `seed` | int | The random seed. | `${system:seed}` |
| `use_pytorch_for_gpu_memory` | bool | Allocate memory via PyTorch. | `${system:use_pytorch_for_gpu_memory}` |
| `tok2vec_model` | str | tok2vec model section in the config. | `"components.tok2vec.model"` |
| `objective` | dict | The pretraining objective. | `{"type": "characters", "n_characters": 4}` |
| `optimizer` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) |
## Training data {#training}
### Binary training format {#binary-training new="3"}
<!-- TODO: document DocBin format -->
> #### Example
>
> ```python
> from spacy.tokens import DocBin
> from spacy.gold import Corpus
>
> doc_bin = DocBin(docs=docs)
> doc_bin.to_disk("./data.spacy")
> reader = Corpus("./data.spacy")
> ```
### JSON input format for training {#json-input}
The main data format used in spaCy v3.0 is a **binary format** created by
serializing a [`DocBin`](/api/docbin) object, which represents a collection of
`Doc` objects. This means that you can train spaCy models using the same format
it outputs: annotated `Doc` objects. The binary format is extremely **efficient
in storage**, especially when packing multiple documents together.
spaCy takes training data in JSON format. The built-in
[`convert`](/api/cli#convert) command helps you convert the `.conllu` format
used by the
[Universal Dependencies corpora](https://github.com/UniversalDependencies) to
spaCy's training format. To convert one or more existing `Doc` objects to
spaCy's JSON format, you can use the
[`gold.docs_to_json`](/api/top-level#docs_to_json) helper.
Typically, the extension for these binary files is `.spacy`, and they are used
as input format for specifying a [training corpus](/api/corpus) and for spaCy's
CLI [`train`](/api/cli#train) command. The built-in
[`convert`](/api/cli#convert) command helps you convert spaCy's previous
[JSON format](#json-input) to the new binary format. It also supports
conversion of the `.conllu` format used by the
[Universal Dependencies corpora](https://github.com/UniversalDependencies).
> #### Annotating entities {#biluo}
### JSON training format {#json-input tag="deprecated"}
<Infobox variant="warning" title="Changed in v3.0">
As of v3.0, the JSON input format is deprecated and is replaced by the
[binary format](#binary-training). Instead of converting [`Doc`](/api/doc)
objects to JSON, you can now serialize them directly using the
[`DocBin`](/api/docbin) container and then use them as input data.
[`spacy convert`](/api/cli#convert) lets you convert your JSON data to the new `.spacy`
format:
```bash
$ python -m spacy convert ./data.json ./output
```
</Infobox>
> #### Annotating entities
>
> Named entities are provided in the
> [BILUO](/usage/linguistic-features#accessing-ner) notation. Tokens outside an
@ -68,152 +257,154 @@ spaCy's JSON format, you can use the
}]
```
<Accordion title="Sample JSON data" spaced>
Here's an example of dependencies, part-of-speech tags and named entities, taken
from the English Wall Street Journal portion of the Penn Treebank:
```json
https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json
https://github.com/explosion/spaCy/blob/v2.3.x/examples/training/training-data.json
```
### Annotations in dictionary format {#dict-input}
</Accordion>
To create [`Example`](/api/example) objects, you can create a dictionary of the
gold-standard annotations `gold_dict`, and then call
### Annotation format for creating training examples {#dict-input}
```python
example = Example.from_dict(doc, gold_dict)
```
An [`Example`](/api/example) object holds the information for one training
instance. It stores two [`Doc`](/api/doc) objects: one for holding the
gold-standard reference data, and one for holding the predictions of the
pipeline. Examples can be created using the
[`Example.from_dict`](/api/example#from_dict) method with a reference `Doc` and
a dictionary of gold-standard annotations.
There are currently two formats supported for this dictionary of annotations:
one with a simple, flat structure of keywords, and one with a more hierarchical
structure.
> #### Example
>
> ```python
> example = Example.from_dict(doc, gold_dict)
> ```
#### Flat structure {#dict-flat}
<Infobox title="Important note" variant="warning">
Here is the full overview of potential entries in a flat dictionary of
annotations. You need to only specify those keys corresponding to the task you
want to train.
`Example` objects are used as part of the
[internal training API](/usage/training#api) and they're expected when you call
[`nlp.update`](/api/language#update). However, for most use cases, you
**shouldn't** have to write your own training scripts. It's recommended to train
your models via the [`spacy train`](/api/cli#train) command with a config file
to keep track of your settings and hyperparameters and your own
[registered functions](/usage/training/#custom-code) to customize the setup.
```python
### Flat dictionary
{
"text": string, # Raw text.
"words": List[string], # List of gold tokens.
"lemmas": List[string], # List of lemmas.
"spaces": List[bool], # List of boolean values indicating whether the corresponding tokens is followed by a space or not.
"tags": List[string], # List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging).
"pos": List[string], # List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging).
"morphs": List[string], # List of [morphological features](/usage/linguistic-features#rule-based-morphology).
"sent_starts": List[bool], # List of boolean values indicating whether each token is the first of a sentence or not.
"deps": List[string], # List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head.
"heads": List[int], # List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text.
"entities": List[string], # Option 1: List of [BILUO tags](#biluo) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens.
"entities": List[(int, int, string)], # Option 2: List of `"(start, end, label)"` tuples defining all entities in.
"cats": Dict[str, float], # Dictionary of `label:value` pairs indicating how relevant a certain [category](/api/textcategorizer) is for the text.
"links": Dict[(int, int), Dict], # Dictionary of `offset:dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs.
}
```
</Infobox>
There are a few caveats to take into account:
> #### Example
>
> ```python
> {
> "text": str,
> "words": List[str],
> "lemmas": List[str],
> "spaces": List[bool],
> "tags": List[str],
> "pos": List[str],
> "morphs": List[str],
> "sent_starts": List[bool],
> "deps": List[str],
> "heads": List[int],
> "entities": List[str],
> "entities": List[(int, int, str)],
> "cats": Dict[str, float],
> "links": Dict[(int, int), dict],
> }
> ```
| Name | Type | Description |
| ------------- | ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `text` | str | Raw text. |
| `words` | `List[str]` | List of gold-standard tokens. |
| `lemmas` | `List[str]` | List of lemmas. |
| `spaces` | `List[bool]` | List of boolean values indicating whether the corresponding token is followed by a space or not. |
| `tags` | `List[str]` | List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging). |
| `pos` | `List[str]` | List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging). |
| `morphs` | `List[str]` | List of [morphological features](/usage/linguistic-features#rule-based-morphology). |
| `sent_starts` | `List[bool]` | List of boolean values indicating whether each token is the first of a sentence or not. |
| `deps` | `List[str]` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. |
| `heads` | `List[int]` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. |
| `entities` | `List[str]` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. |
| `entities` | `List[Tuple[int, int, str]]` | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. |
| `cats` | `Dict[str, float]` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. |
| `links` | `Dict[(int, int), Dict]` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. |
<Infobox title="Notes and caveats">
- Multiple formats are possible for the "entities" entry, but you have to pick
one.
- Any values for sentence starts will be ignored if there are annotations for
dependency relations.
- If the dictionary contains values for "text" and "words", but not "spaces",
the latter are inferred automatically. If "words" is not provided either, the
values are inferred from the `doc` argument.
##### Examples
```python
# Training data for a part-of-speech tagger
doc = Doc(vocab, words=["I", "like", "stuff"])
example = Example.from_dict(doc, {"tags": ["NOUN", "VERB", "NOUN"]})
# Training data for an entity recognizer (option 1)
doc = nlp("Laura flew to Silicon Valley.")
biluo_tags = ["U-PERS", "O", "O", "B-LOC", "L-LOC"]
example = Example.from_dict(doc, {"entities": biluo_tags})
# Training data for an entity recognizer (option 2)
doc = nlp("Laura flew to Silicon Valley.")
entity_tuples = [
(0, 5, "PERSON"),
(14, 28, "LOC"),
]
example = Example.from_dict(doc, {"entities": entity_tuples})
# Training data for text categorization
doc = nlp("I'm pretty happy about that!")
example = Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})
# Training data for an Entity Linking component
doc = nlp("Russ Cochran his reprints include EC Comics.")
example = Example.from_dict(doc, {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}})
```
#### Hierarchical structure {#dict-hierarch}
Internally, a more hierarchical dictionary structure is used to store
gold-standard annotations. Its format is similar to the structure described in
the previous section, but there are two main sections `token_annotation` and
`doc_annotation`, and the keys for token annotations should be uppercase
[`Token` attributes](/api/token#attributes) such as "ORTH" and "TAG".
```python
### Hierarchical dictionary
{
"text": string, # Raw text.
"token_annotation": {
"ORTH": List[string], # List of gold tokens.
"LEMMA": List[string], # List of lemmas.
"SPACY": List[bool], # List of boolean values indicating whether the corresponding tokens is followed by a space or not.
"TAG": List[string], # List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging).
"POS": List[string], # List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging).
"MORPH": List[string], # List of [morphological features](/usage/linguistic-features#rule-based-morphology).
"SENT_START": List[bool], # List of boolean values indicating whether each token is the first of a sentence or not.
"DEP": List[string], # List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head.
"HEAD": List[int], # List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text.
},
"doc_annotation": {
"entities": List[(int, int, string)], # List of [BILUO tags](#biluo) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens.
"cats": Dict[str, float], # Dictionary of `label:value` pairs indicating how relevant a certain [category](/api/textcategorizer) is for the text.
"links": Dict[(int, int), Dict], # Dictionary of `offset:dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs.
}
}
```
There are a few caveats to take into account:
- Any values for sentence starts will be ignored if there are annotations for
dependency relations.
- If the dictionary contains values for "text" and "ORTH", but not "SPACY", the
latter are inferred automatically. If "ORTH" is not provided either, the
values are inferred from the `doc` argument.
## Training config {#config new="3"}
Config files define the training process and model pipeline and can be passed to
[`spacy train`](/api/cli#train). They use
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
hood. For details on how to use training configs, see the
[usage documentation](/usage/training#config).
<Infobox variant="warning">
The `@` syntax lets you refer to function names registered in the
[function registry](/api/top-level#registry). For example,
`@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of
the name `"spacy.HashEmbedCNN.v1"` and all other values defined in its block
will be passed into that function as arguments. Those arguments depend on the
registered function. See the [model architectures](/api/architectures) docs for
API details.
- If the dictionary contains values for `"text"` and `"words"`, but not
`"spaces"`, the latter are inferred automatically. If `"words"` is not provided
either, the values are inferred from the `Doc` argument.
</Infobox>
<!-- TODO: we need to come up with a good way to present the sections and their expected values visually? -->
<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
```python
### Examples
# Training data for a part-of-speech tagger
doc = Doc(vocab, words=["I", "like", "stuff"])
gold_dict = {"tags": ["NOUN", "VERB", "NOUN"]}
example = Example.from_dict(doc, gold_dict)
# Training data for an entity recognizer (option 1)
doc = nlp("Laura flew to Silicon Valley.")
gold_dict = {"entities": ["U-PERS", "O", "O", "B-LOC", "L-LOC", "O"]}
example = Example.from_dict(doc, gold_dict)
# Training data for an entity recognizer (option 2)
doc = nlp("Laura flew to Silicon Valley.")
gold_dict = {"entities": [(0, 5, "PERSON"), (14, 28, "LOC")]}
example = Example.from_dict(doc, gold_dict)
# Training data for text categorization
doc = nlp("I'm pretty happy about that!")
gold_dict = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
example = Example.from_dict(doc, gold_dict)
# Training data for an Entity Linking component
doc = nlp("Russ Cochran his reprints include EC Comics.")
gold_dict = {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}
example = Example.from_dict(doc, gold_dict)
```
## Pretraining data {#pretraining}
The [`spacy pretrain`](/api/cli#pretrain) command lets you pretrain the tok2vec
layer of pipeline components from raw text. Raw text can be provided as a
`.jsonl` (newline-delimited JSON) file containing one input text per line
(roughly paragraph length is good). Optionally, custom tokenization can be
provided.
> #### Tip: Writing JSONL
>
> Our utility library [`srsly`](https://github.com/explosion/srsly) provides a
> handy `write_jsonl` helper that takes a file path and list of dictionaries and
> writes out JSONL-formatted data.
>
> ```python
> import srsly
> data = [{"text": "Some text"}, {"text": "More..."}]
> srsly.write_jsonl("/path/to/text.jsonl", data)
> ```
| Key | Type | Description |
| -------- | ---- | ---------------------------------------------------------- |
| `text` | str | The raw input text. Not required if `tokens` is provided. |
| `tokens` | list | Optional tokenization, one string per token. |
```json
### Example
{"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
{"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
{"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]}
```
## Lexical data for vocabulary {#vocab-jsonl new="2"}

View File

@ -265,37 +265,6 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
| ----------- | -------------------------------------- | ----------------------------------------------- |
| **RETURNS** | `numpy.ndarray[ndim=2, dtype="int32"]` | The lowest common ancestor matrix of the `Doc`. |
## Doc.to_json {#to_json tag="method" new="2.1"}
Convert a Doc to JSON. The format it produces will be the new format for the
[`spacy train`](/api/cli#train) command (not implemented yet). If custom
underscore attributes are specified, their values need to be JSON-serializable.
They'll be added to an `"_"` key in the data, e.g. `"_": {"foo": "bar"}`.
> #### Example
>
> ```python
> doc = nlp("Hello")
> json_doc = doc.to_json()
> ```
>
> #### Result
>
> ```python
> {
> "text": "Hello",
> "ents": [],
> "sents": [{"start": 0, "end": 5}],
> "tokens": [{"id": 0, "start": 0, "end": 5, "pos": "INTJ", "tag": "UH", "dep": "ROOT", "head": 0}
> ]
> }
> ```
| Name | Type | Description |
| ------------ | ---- | ------------------------------------------------------------------------------ |
| `underscore` | list | Optional list of string names of custom JSON-serializable `doc._.` attributes. |
| **RETURNS** | dict | The JSON-formatted data. |
## Doc.to_array {#to_array tag="method"}
Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence

View File

@ -125,7 +125,8 @@ Serialize the `DocBin`'s annotations to a bytestring.
> #### Example
>
> ```python
> doc_bin = DocBin(attrs=["DEP", "HEAD"])
> docs = [nlp("Hello world!")]
> doc_bin = DocBin(docs=docs)
> doc_bin_bytes = doc_bin.to_bytes()
> ```
@ -148,3 +149,36 @@ Deserialize the `DocBin`'s annotations from a bytestring.
| ------------ | -------- | ---------------------- |
| `bytes_data` | bytes | The data to load from. |
| **RETURNS** | `DocBin` | The loaded `DocBin`. |
## DocBin.to_disk {#to_disk tag="method" new="3"}
Save the serialized `DocBin` to a file. Typically uses the `.spacy` extension
and the result can be used as the input data for
[`spacy train`](/api/cli#train).
> #### Example
>
> ```python
> docs = [nlp("Hello world!")]
> doc_bin = DocBin(docs=docs)
> doc_bin.to_disk("./data.spacy")
> ```
| Argument | Type | Description |
| -------- | ------------ | ----------------------------------------------------- |
| `path` | str / `Path` | The file path, typically with the `.spacy` extension. |
## DocBin.from_disk {#from_disk tag="method" new="3"}
Load a serialized `DocBin` from a file. Typically uses the `.spacy` extension.
> #### Example
>
> ```python
> doc_bin = DocBin().from_disk("./data.spacy")
> ```
| Argument | Type | Description |
| ----------- | ------------ | ----------------------------------------------------- |
| `path` | str / `Path` | The file path, typically with the `.spacy` extension. |
| **RETURNS** | `DocBin` | The loaded `DocBin`. |

View File

@ -9,6 +9,13 @@ api_string_name: entity_linker
api_trainable: true
---
An `EntityLinker` component disambiguates textual mentions (tagged as named
entities) to unique identifiers, grounding the named entities into the "real
world". It requires a `KnowledgeBase`, as well as a function to generate
plausible candidates from that `KnowledgeBase` given a certain textual mention,
and an ML model to pick the right candidate, given the local context of the
mention.
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@ -23,22 +30,24 @@ architectures and their arguments and hyperparameters.
> ```python
> from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL
> config = {
> "kb": None,
> "labels_discard": [],
> "incl_prior": True,
> "incl_context": True,
> "model": DEFAULT_NEL_MODEL,
> "kb_loader": {'@assets': 'spacy.EmptyKB.v1', 'entity_vector_length': 64},
> "get_candidates": {'@assets': 'spacy.CandidateGenerator.v1'},
> }
> nlp.add_pipe("entity_linker", config=config)
> ```
| Setting | Type | Description | Default |
| ---------------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- |
| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb) holding all entities and their aliases. | `None` |
| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | `[]` |
| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | `True` |
| `incl_context` | bool | Whether or not to include the local context in the model. | `True` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) |
| Setting | Type | Description | Default |
| ---------------- | -------------------------------------------------------- | --------------------------------------------------------------------------- | ------------------------------------------------------ |
| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | `[]` |
| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | `True` |
| `incl_context` | bool | Whether or not to include the local context in the model. | `True` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) |
| `kb_loader` | `Callable[[Vocab], KnowledgeBase]` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. | An empty KnowledgeBase with `entity_vector_length` 64. |
| `get_candidates` | `Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]` | Function that generates plausible candidates for a given `Span` object. | Built-in dictionary-lookup function. |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
@ -53,7 +62,11 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
> entity_linker = nlp.add_pipe("entity_linker")
>
> # Construction via add_pipe with custom model
> config = {"model": {"@architectures": "my_el"}}
> config = {"model": {"@architectures": "my_el.v1"}}
> entity_linker = nlp.add_pipe("entity_linker", config=config)
>
> # Construction via add_pipe with custom KB and candidate generation
> config = {"kb_loader": {"@assets": "my_kb.v1"}, "get_candidates": {"@assets": "my_candidates.v1"},}
> entity_linker = nlp.add_pipe("entity_linker", config=config)
>
> # Construction from class
@ -65,18 +78,20 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
Note that both the internal KB as well as the Candidate generator can be
customized by providing custom registered functions.
| Name | Type | Description |
| ---------------- | --------------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | |
| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb) holding all entities and their aliases. |
| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. |
| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. |
| `incl_context` | bool | Whether or not to include the local context in the model. |
| Name | Type | Description |
| ---------------- | -------------------------------------------------------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | |
| `kb_loader` | `Callable[[Vocab], KnowledgeBase]` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. |
| `get_candidates` | `Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]` | Function that generates plausible candidates for a given `Span` object. |
| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. |
| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. |
| `incl_context` | bool | Whether or not to include the local context in the model. |
## EntityLinker.\_\_call\_\_ {#call tag="method"}

View File

@ -27,7 +27,7 @@ how the component should be configured. You can override its settings via the
> ```python
> config = {
> "phrase_matcher_attr": None,
> "validation": True,
> "validate": True,
> "overwrite_ents": False,
> "ent_id_sep": "||",
> }
@ -37,7 +37,7 @@ how the component should be configured. You can override its settings via the
| Setting | Type | Description | Default |
| --------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| `phrase_matcher_attr` | str | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. | `None` |
| `validation` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. | `False` |
| `validate` | bool | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). | `False` |
| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. | `False` |
| `ent_id_sep` | str | Separator used internally for entity IDs. | `"||"` |

View File

@ -244,8 +244,7 @@ accuracy of predicted entities against the original gold-standard annotation.
## Example.to_dict {#to_dict tag="method"}
Return a
[hierarchical dictionary representation](/api/data-formats#dict-hierarch) of the
Return a [dictionary representation](/api/data-formats#dict-input) of the
reference annotation contained in this `Example`.
> #### Example
@ -256,7 +255,7 @@ reference annotation contained in this `Example`.
| Name | Type | Description |
| ----------- | ---------------- | ------------------------------------------------------ |
| **RETURNS** | `Dict[str, obj]` | Dictionary representation of the reference annotation. |
| **RETURNS** | `Dict[str, Any]` | Dictionary representation of the reference annotation. |
## Example.split_sents {#split_sents tag="method"}

View File

@ -1,102 +1,263 @@
---
title: Lemmatizer
teaser: Assign the base forms of words
tag: class
source: spacy/lemmatizer.py
source: spacy/pipeline/lemmatizer.py
new: 3
teaser: 'Pipeline component for lemmatization'
api_base_class: /api/pipe
api_string_name: lemmatizer
api_trainable: false
---
<!-- TODO: rewrite once it's converted to pipe -->
## Config and implementation
The `Lemmatizer` supports simple part-of-speech-sensitive suffix rules and
lookup tables.
The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
[`config.cfg` for training](/usage/training#config).
For examples of the lookups data formats used by the lookup and rule-based
lemmatizers, see the
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.
> #### Example
>
> ```python
> config = {"mode": "rule"}
> nlp.add_pipe("lemmatizer", config=config)
> ```
| Setting | Type | Description | Default |
| ----------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- |
| `mode` | str | The lemmatizer mode, e.g. "lookup" or "rule". | `"lookup"` |
| `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from `spacy-lookups-data`. | `None` |
| `overwrite` | bool | Whether to overwrite existing lemmas. | `False` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/lemmatizer.py
```
## Lemmatizer.\_\_init\_\_ {#init tag="method"}
Initialize a `Lemmatizer`. Typically, this happens under the hood within spaCy
when a `Language` subclass and its `Vocab` is initialized.
> #### Example
>
> ```python
> from spacy.lemmatizer import Lemmatizer
> from spacy.lookups import Lookups
> lookups = Lookups()
> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
> lemmatizer = Lemmatizer(lookups)
> ```
> # Construction via add_pipe with default model
> lemmatizer = nlp.add_pipe("lemmatizer")
>
> For examples of the data format, see the
> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.
> # Construction via add_pipe with custom settings
> config = {"mode": "rule", "overwrite": True}
> lemmatizer = nlp.add_pipe("lemmatizer", config=config)
> ```
| Name | Type | Description |
| -------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. |
Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
| Name | Type | Description |
| -------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | [`Vocab`](/api/vocab) | The vocab. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | |
| `mode` | str | The lemmatizer mode, e.g. "lookup" or "rule". Defaults to "lookup". |
| `lookups` | [`Lookups`](/api/lookups) | A lookups object containing the tables such as "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". Defaults to `None`. |
| `overwrite` | bool | Whether to overwrite existing lemmas. |
## Lemmatizer.\_\_call\_\_ {#call tag="method"}
Lemmatize a string.
Apply the pipe to one document. The document is modified in place, and returned.
This usually happens under the hood when the `nlp` object is called on a text
and all pipeline components are applied to the `Doc` in order.
> #### Example
>
> ```python
> from spacy.lemmatizer import Lemmatizer
> from spacy.lookups import Lookups
> lookups = Lookups()
> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
> lemmatizer = Lemmatizer(lookups)
> lemmas = lemmatizer("ducks", "NOUN")
> assert lemmas == ["duck"]
> doc = nlp("This is a sentence.")
> lemmatizer = nlp.add_pipe("lemmatizer")
> # This usually happens under the hood
> processed = lemmatizer(doc)
> ```
| Name | Type | Description |
| ------------ | ------------- | -------------------------------------------------------------------------------------------------------- |
| `string` | str | The string to lemmatize, e.g. the token text. |
| `univ_pos` | str / int | The token's universal part-of-speech tag. |
| `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. |
| **RETURNS** | list | The available lemmas for the string. |
| Name | Type | Description |
| ----------- | ----- | ------------------------ |
| `doc` | `Doc` | The document to process. |
| **RETURNS** | `Doc` | The processed document. |
## Lemmatizer.lookup {#lookup tag="method" new="2"}
## Lemmatizer.pipe {#pipe tag="method"}
Look up a lemma in the lookup table, if available. If no lemma is found, the
Apply the pipe to a stream of documents. This usually happens under the hood
when the `nlp` object is called on a text and all pipeline components are
applied to the `Doc` in order.
> #### Example
>
> ```python
> lemmatizer = nlp.add_pipe("lemmatizer")
> for doc in lemmatizer.pipe(docs, batch_size=50):
> pass
> ```
| Name | Type | Description |
| -------------- | --------------- | ------------------------------------------------------ |
| `stream` | `Iterable[Doc]` | A stream of documents. |
| _keyword-only_ | | |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
Lemmatize a token using a lookup-based approach. If no lemma is found, the
original string is returned. Languages can provide a
[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.
> #### Example
>
> ```python
> lookups = Lookups()
> lookups.add_table("lemma_lookup", {"going": "go"})
> assert lemmatizer.lookup("going") == "go"
> ```
| Name | Type | Description |
| ----------- | --------------------- | ------------------------------------- |
| `token` | [`Token`](/api/token) | The token to lemmatize. |
| **RETURNS** | `List[str]` | A list containing one or more lemmas. |
| Name | Type | Description |
| ----------- | ---- | ----------------------------------------------------------------------------------------------------------- |
| `string` | str | The string to look up. |
| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
| **RETURNS** | str | The lemma if the string was found, otherwise the original string. |
## Lemmatizer.rule_lemmatize {#rule_lemmatize tag="method"}
Lemmatize a token using a rule-based approach. Typically relies on POS tags.
| Name | Type | Description |
| ----------- | --------------------- | ------------------------------------- |
| `token` | [`Token`](/api/token) | The token to lemmatize. |
| **RETURNS** | `List[str]` | A list containing one or more lemmas. |
## Lemmatizer.is_base_form {#is_base_form tag="method"}
Check whether we're dealing with an uninflected paradigm, so we can avoid
lemmatization entirely.
| Name | Type | Description |
| ----------- | --------------------- | ------------------------------------------------------------------------------------------------------- |
| `token` | [`Token`](/api/token) | The token to analyze. |
| **RETURNS** | bool | Whether the token's attributes (e.g., part-of-speech tag, morphological features) describe a base form. |
## Lemmatizer.get_lookups_config {#get_lookups_config tag="classmethod"}
Returns the lookups configuration settings for a given mode for use in
[`Lemmatizer.load_lookups`](#load_lookups).
| Name | Type | Description |
| ----------- | ---- | ------------------------------------------------- |
| `mode` | str | The lemmatizer mode. |
| **RETURNS** | dict | The lookups configuration settings for this mode. |
## Lemmatizer.load_lookups {#load_lookups tag="classmethod"}
Load and validate lookups tables. If the provided lookups is `None`, load the
default lookups tables according to the language and mode settings. Confirm that
all required tables for the language and mode are present.
| Name | Type | Description |
| ----------- | ------------------------- | ---------------------------------------------------------------------------- |
| `lang` | str | The language. |
| `mode` | str | The lemmatizer mode. |
| `lookups` | [`Lookups`](/api/lookups) | The provided lookups, may be `None` if the default lookups should be loaded. |
| **RETURNS** | [`Lookups`](/api/lookups) | The lookups object. |
## Lemmatizer.to_disk {#to_disk tag="method"}
Serialize the pipe to disk.
> #### Example
>
> ```python
> pos = "verb"
> morph = {"VerbForm": "inf"}
> is_base_form = lemmatizer.is_base_form(pos, morph)
> assert is_base_form == True
> lemmatizer = nlp.add_pipe("lemmatizer")
> lemmatizer.to_disk("/path/to/lemmatizer")
> ```
| Name | Type | Description |
| ------------ | --------- | --------------------------------------------------------------------------------------- |
| `univ_pos` | str / int | The token's universal part-of-speech tag. |
| `morphology` | dict | The token's morphological features. |
| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. |
| Name | Type | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| _keyword-only_ | | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
## Lemmatizer.from_disk {#from_disk tag="method"}
Load the pipe from disk. Modifies the object in place and returns it.
> #### Example
>
> ```python
> lemmatizer = nlp.add_pipe("lemmatizer")
> lemmatizer.from_disk("/path/to/lemmatizer")
> ```
| Name | Type | Description |
| -------------- | --------------- | -------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| _keyword-only_ | | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Lemmatizer` | The modified `Lemmatizer` object. |
## Lemmatizer.to_bytes {#to_bytes tag="method"}
> #### Example
>
> ```python
> lemmatizer = nlp.add_pipe("lemmatizer")
> lemmatizer_bytes = lemmatizer.to_bytes()
> ```
Serialize the pipe to a bytestring.
| Name | Type | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- |
| _keyword-only_ | | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `Lemmatizer` object. |
## Lemmatizer.from_bytes {#from_bytes tag="method"}
Load the pipe from a bytestring. Modifies the object in place and returns it.
> #### Example
>
> ```python
> lemmatizer_bytes = lemmatizer.to_bytes()
> lemmatizer = nlp.add_pipe("lemmatizer")
> lemmatizer.from_bytes(lemmatizer_bytes)
> ```
| Name | Type | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. |
| _keyword-only_ | | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Lemmatizer` | The `Lemmatizer` object. |
## Lemmatizer.mode {#mode tag="property"}
The lemmatizer mode.
| Name | Type | Description |
| ----------- | ----- | -------------------- |
| **RETURNS** | `str` | The lemmatizer mode. |
## Attributes {#attributes}
| Name | Type | Description |
| -------------------------------------- | ------------------------- | --------------------------------------------------------------- |
| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the rules and data, if available. |
| Name | Type | Description |
| --------- | --------------------------------- | ------------------- |
| `vocab` | [`Vocab`](/api/vocab) | The shared vocabulary. |
| `lookups` | [`Lookups`](/api/lookups) | The lookups object. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> data = lemmatizer.to_disk("/path", exclude=["vocab"])
> ```
| Name | Description |
| --------- | ---------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `lookups` | The lookups. You usually don't want to exclude this. |

View File

@ -11,22 +11,19 @@ this class.
## Morphology.\_\_init\_\_ {#init tag="method"}
Create a Morphology object using the tag map, lemmatizer and exceptions.
Create a Morphology object.
> #### Example
>
> ```python
> from spacy.morphology import Morphology
>
> morphology = Morphology(strings, tag_map, lemmatizer)
> morphology = Morphology(strings)
> ```
| Name | Type | Description |
| ------------ | ----------------- | ---------------------------------------------------------------------------------------------------------- |
| `strings` | `StringStore` | The string store. |
| `tag_map` | `Dict[str, Dict]` | The tag map. |
| `lemmatizer` | `Lemmatizer` | The lemmatizer. |
| `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1", "Feat2": "Val2", ...}}}` |
| Name | Type | Description |
| --------- | ------------- | ----------------- |
| `strings` | `StringStore` | The string store. |
## Morphology.add {#add tag="method"}
@ -62,52 +59,6 @@ Get the FEATS string for the hash of the morphological analysis.
| ------- | ---- | --------------------------------------- |
| `morph` | int | The hash of the morphological analysis. |
## Morphology.load_tag_map {#load_tag_map tag="method"}
Replace the current tag map with the provided tag map.
| Name | Type | Description |
| --------- | ----------------- | ------------ |
| `tag_map` | `Dict[str, Dict]` | The tag map. |
## Morphology.load_morph_exceptions {#load_morph_exceptions tag="method"}
Replace the current morphological exceptions with the provided exceptions.
| Name | Type | Description |
| ------------- | ----------------- | ----------------------------- |
| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. |
## Morphology.add_special_case {#add_special_case tag="method"}
Add a special-case rule to the morphological analyzer. Tokens whose tag and orth
match the rule will receive the specified properties.
> #### Example
>
> ```python
> attrs = {"POS": "DET", "Definite": "Def"}
> morphology.add_special_case("DT", "the", attrs)
> ```
| Name | Type | Description |
| ---------- | ---- | ---------------------------------------------- |
| `tag_str` | str | The fine-grained tag. |
| `orth_str` | str | The token text. |
| `attrs` | dict | The features to assign for this token and tag. |
## Morphology.exc {#exc tag="property"}
The current morphological exceptions.
| Name | Type | Description |
| ---------- | ---- | --------------------------------------------------- |
| **YIELDS** | dict | The current dictionary of morphological exceptions. |
## Morphology.lemmatize {#lemmatize tag="method"}
TODO
## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"}
Convert a string FEATS representation to a dictionary of features and values in

View File

@ -47,7 +47,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx
>
> # Construction via add_pipe with custom model
> config = {"model": {"@architectures": "my_tagger"}}
> parser = nlp.add_pipe("tagger", config=config)
> tagger = nlp.add_pipe("tagger", config=config)
>
> # Construction from class
> from spacy.pipeline import Tagger
@ -285,16 +285,14 @@ Add a new label to the pipe.
> #### Example
>
> ```python
> from spacy.symbols import POS
> tagger = nlp.add_pipe("tagger")
> tagger.add_label("MY_LABEL", {POS: "NOUN"})
> tagger.add_label("MY_LABEL")
> ```
| Name | Type | Description |
| ----------- | ---------------- | --------------------------------------------------------------- |
| `label` | str | The label to add. |
| `values` | `Dict[int, str]` | Optional values to map to the label, e.g. a tag map dictionary. |
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
| Name | Type | Description |
| ----------- | ---- | --------------------------------------------------- |
| `label` | str | The label to add. |
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
## Tagger.to_disk {#to_disk tag="method"}
@ -369,9 +367,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
## Tagger.labels {#labels tag="property"}
The labels currently added to the component. Note that even for a blank
component, this will always include the built-in coarse-grained part-of-speech
tags by default, e.g. `VERB`, `NOUN` and so on.
The labels currently added to the component.
> #### Example
>
@ -396,9 +392,8 @@ serialization by passing in the string names via the `exclude` argument.
> data = tagger.to_disk("/path", exclude=["vocab"])
> ```
| Name | Description |
| --------- | ------------------------------------------------------------------------------------------ |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `cfg` | The config file. You usually don't want to exclude this. |
| `model` | The binary model data. You usually don't want to exclude this. |
| `tag_map` | The [tag map](/usage/adding-languages#tag-map) mapping fine-grained to coarse-grained tag. |
| Name | Description |
| ------- | -------------------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `cfg` | The config file. You usually don't want to exclude this. |
| `model` | The binary model data. You usually don't want to exclude this. |

View File

@ -5,9 +5,20 @@ tag: class
source: spacy/tokenizer.pyx
---
> #### Default config
>
> ```ini
> [nlp.tokenizer]
> @tokenizers = "spacy.Tokenizer.v1"
> ```
Segment text, and create `Doc` objects with the discovered segment boundaries.
For a deeper understanding, see the docs on
[how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works).
The tokenizer is typically created automatically when a
[`Language`](/api/language) subclass is initialized and it reads its settings
like punctuation and special case rules from the
[`Language.Defaults`](/api/language#defaults) provided by the language subclass.
## Tokenizer.\_\_init\_\_ {#init tag="method"}

View File

@ -4,7 +4,7 @@ menu:
- ['spacy', 'spacy']
- ['displacy', 'displacy']
- ['registry', 'registry']
- ['Readers & Batchers', 'readers-batchers']
- ['Batchers', 'batchers']
- ['Data & Alignment', 'gold']
- ['Utility Functions', 'util']
---
@ -299,13 +299,14 @@ factories.
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
| `factories` | Registry for functions that create [pipeline components](/usage/processing-pipelines#custom-components). Added automatically when you use the `@spacy.component` decorator and also reads from [entry points](/usage/saving-loading#entry-points) |
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
| `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
| `assets` | |
| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
| `readers` | Registry for training and evaluation [data readers](#readers-batchers). |
| `batchers` | Registry for training and evaluation [data batchers](#readers-batchers). |
| `readers` | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). |
| `batchers` | Registry for training and evaluation [data batchers](#batchers). |
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
@ -337,42 +338,9 @@ See the [`Transformer`](/api/transformer) API reference and
| [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. |
| [`annotation_setters`](/api/transformer#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. |
## Data readers and batchers {#readers-batchers new="3"}
## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}
<!-- TODO: -->
### spacy.Corpus.v1 {#corpus tag="registered function" source="spacy/gold/corpus.py"}
Registered function that creates a [`Corpus`](/api/corpus) of training or
evaluation data. It takes the same arguments as the `Corpus` class and returns a
callable that yields [`Example`](/api/example) objects. You can replace it with
your own registered function in the [`@readers` registry](#registry) to
customize the data loading and streaming.
> #### Example config
>
> ```ini
> [paths]
> train = "corpus/train.spacy"
>
> [training.train_corpus]
> @readers = "spacy.Corpus.v1"
> path = ${paths:train}
> gold_preproc = false
> max_length = 0
> limit = 0
> ```
| Name | Type | Description |
| --------------- | ------ | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | `Path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). |
|  `gold_preproc` | bool | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. |
| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. |
| `limit` | int | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. |
### Batchers {#batchers source="spacy/gold/batchers.py"}
<!-- TODO: -->
<!-- TODO: intro and also describe signature of functions -->
#### batch_by_words.v1 {#batch_by_words tag="registered function"}
@ -446,28 +414,6 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
## Training data and alignment {#gold source="spacy/gold"}
### gold.docs_to_json {#docs_to_json tag="function"}
Convert a list of Doc objects into the
[JSON-serializable format](/api/data-formats#json-input) used by the
[`spacy train`](/api/cli#train) command. Each input doc will be treated as a
'paragraph' in the output doc.
> #### Example
>
> ```python
> from spacy.gold import docs_to_json
>
> doc = nlp("I like London")
> json_data = docs_to_json([doc])
> ```
| Name | Type | Description |
| ----------- | ---------------- | ------------------------------------------ |
| `docs` | iterable / `Doc` | The `Doc` object(s) to convert. |
| `id` | int | ID to assign to the JSON. Defaults to `0`. |
| **RETURNS** | dict | The data in spaCy's JSON format. |
### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"}
Encode labelled spans into per-token tags, using the

View File

@ -24,8 +24,6 @@ Create the vocabulary.
| Name | Type | Description |
| -------------------------------------------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lex_attr_getters` | dict | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. |
| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
| `lemmatizer` | object | A lemmatizer. Defaults to `None`. |
| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. |
| `lookups` | `Lookups` | A [`Lookups`](/api/lookups) that stores the `lemma_\*`, `lexeme_norm` and other large lookup tables. Defaults to `None`. |
| `lookups_extra` <Tag variant="new">2.3</Tag> | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. |

View File

@ -1,85 +0,0 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
<style>
.svg__langdata__text-large, .svg__langdata__text-small, .svg__langdata__text-tiny {
font-family: Arial, sans-serif;
fill: #1a1e23
}
.svg__langdata__text-large { font-size: 20px }
.svg__langdata__text-small, .svg__langdata__text-tiny { font-weight: bold; font-size: 15px; }
</style>
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M642.7 361.3H708l33 43.6-33 43.5H643L610 405z"/>
<text class="svg__langdata__text-large" transform="translate(630 410)" width="80" height="22">Tokenizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H621v-56.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M621 240.2l4 8-4-2-4 2z"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M855 253v-20.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M855 226.2l4 8-4-2-4 2z"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303h-45v-56.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M735 240.2l4 8-4-2-4 2z"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H504v-56.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M504 240.2l4 8-4-2-4 2z"/>
<ellipse cx="855" cy="303" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="svg__langdata__text-large" transform="translate(815 308)" width="119" height="46">Base data</text>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100l.4 39.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389.5 145.8l-4-8 4 2 4-2z"/>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17h232v22.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M621 145.8l-4-8 4 2 4-2z"/>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17H280v22.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M280 145.8l-4-8 4 2 4-2z"/>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17h115v22.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M504 145.8l-4-8 4 2 4-2z"/>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17h346v22.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M735 145.8l-4-8 4 2 4-2z"/>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17H163v22.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M163 145.8l-4-8 4 2 4-2z"/>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17H46v22.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M46 145.8l-4-8 4 2 4-2z"/>
<ellipse cx="389" cy="50" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="svg__langdata__text-large" transform="translate(346.5 42)" width="81" height="46">Language <tspan dy="1.45em" dx="-3.7em">data</tspan></text>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M435 193h15.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M456.8 193l-8 4 2-4-2-4z"/>
<ellipse cx="390" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="svg__langdata__text-small" transform="translate(368 187.5)" width="39" height="30">stop <tspan dx="-2.8em" dy="1.25em">words</tspan></text>
<path fill="none" stroke="#9673a6" stroke-width="3" stroke-miterlimit="10" d="M472 225l-1.5 133.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M470.4 364.8l-4-8 4 2 4-2z"/>
<ellipse cx="504" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="svg__langdata__text-small" transform="translate(473 187.5)" width="85" height="30">lexical <tspan dx="-4em" dy="1.25em">attributes</tspan></text>
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M653 225l5.6 127.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M659 358.8l-4.5-8 4 2 4-2.2z"/>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M576 193h-18.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M551.2 193l8-4-2 4 2 4z"/>
<ellipse cx="621" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="svg__langdata__text-small" transform="translate(582 187.5)" width="85" height="30">tokenizer <tspan dx="-5.2em" dy="1.25em">exceptions</tspan></text>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M690 193h-15.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M668.2 193l8-4-2 4 2 4z"/>
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M703 225l-10.3 127.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M692.2 358.8l-3.4-8.3 4 2.3 4-1.7z"/>
<ellipse cx="735" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="svg__langdata__text-small" transform="translate(705 182)" width="53" height="46">prefixes, <tspan dy="1.25em" dx="-4.4em">suffixes,</tspan> <tspan dy="1.25em" dx="-4em">infixes</tspan>
</text>
<path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M280 238v114.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M280 358.8l-4-8 4 2 4-2z"/>
<ellipse cx="280" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="svg__langdata__text-small" transform="translate(254 187.5)" width="71" height="30">lemma <tspan dy="1.25em" dx="-3em">data</tspan></text>
<path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M346 404h53.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M405.8 404l-8 4 2-4-2-4z"/>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M247.7 361.3H313l33 43.6-33 43.5h-65.3L215 405z"/>
<text class="svg__langdata__text-large" transform="translate(228 410)" width="100" height="22">Lemmatizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M823 193h-34.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M782.2 193l8-4-2 4 2 4z"/>
<ellipse cx="855" cy="193" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="31.5" ry="31.5"/>
<text class="svg__langdata__text-tiny" transform="translate(829 189)" width="50" height="30">char <tspan dy="1.1em" dx="-3.1em">classes</tspan></text>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M408 367h124v74H408z"/>
<text class="svg__langdata__text-large" transform="translate(443.5 410)" width="51" height="22">Token</text>
<path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M131 225l-21 122.2" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M109 353l-2.5-8.5 3.6 2.7 4.4-1.3z"/>
<ellipse cx="163" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="svg__langdata__text-small" transform="translate(139 187.5)" width="45" height="30">morph <tspan dy="1.25em" dx="-3.2em">rules</tspan></text>
<path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M78 225l15.4 122" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M94.2 353l-5-7.5 4.2 1.5 3.7-2.5z"/>
<ellipse cx="46" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="svg__langdata__text-small" transform="translate(33 187.5)" width="27" height="30">tag <tspan dy="1.25em" dx="-2em">map</tspan></text>
<ellipse cx="101" cy="405" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.5" ry="49.5"/>
<text class="svg__langdata__text-large" transform="translate(49.5 410)" width="100" height="22">Morphology</text>
</svg>

Before

Width:  |  Height:  |  Size: 9.1 KiB

View File

@ -1,123 +1,305 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="600" height="380" viewBox="-20 -10 550 400">
<style>
.svg__tokenization__text { fill: #1a1e23; font: 18px Arial, sans-serif }
.svg__tokenization__text-small { fill: #fff; font: 600 13px Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace }
</style>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 39v12H16v11M71 39v12h20v11"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M1 1h140v38.2H1z"/>
<text class="svg__tokenization__text" dy="1em" width="43" height="19" transform="translate(48.5 9.5)">“Let’s</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 1h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 9.5)" width="19" height="19">go</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 1h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 9.5)" width="15" height="19">to</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 1h141v38.2H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 9.5)" width="38" height="19">N.Y.!”</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 100v20"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 62h30v38.2H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 70.5)" width="7" height="19">“</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M91 100v11H66v9M91 100v11h29v9"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M41 62h100v38.2H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(72.5 70.5)" width="35" height="19">Let’s</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 62h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 70.5)" width="19" height="19">go</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 62h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 70.5)" width="15" height="19">to</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 62h141v38.2H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 70.5)" width="38" height="19">N.Y.!”</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 120h30v38H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 128.5)" width="7" height="19">“</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 120h50v38H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 128.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 120h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 128.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 120h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 128.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M341 158v13h-20v11M341 158v13h55v11"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 120h141v38H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 128.5)" width="38" height="19">N.Y.!”</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 120h40v38h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 128.5)" width="11" height="19">’s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 181.8h30V220H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 190.5)" width="7" height="19">“</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 181.8h50V220H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 190.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 181.8h50V220h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 190.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 181.8h50V220h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 190.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M321 220v11h-20v12M321 220v11h34v12"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 181.8h101V220H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(304.5 190.5)" width="30" height="19">N.Y.!</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 181.8h40V220h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 190.5)" width="11" height="19">’s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 181.8h30V220h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 190.5)" width="7" height="19">”</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 242.7h30V281H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 251.5)" width="7" height="19">“</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 242.7h50V281H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 251.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 281v20-17 20"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 242.7h50V281h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 251.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 242.7h50V281h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 251.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M301 281v23"/>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" d="M270 242.7h61V281h-61z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 251.5)" width="26" height="19">N.Y.</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 242.7h40V281h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 251.5)" width="11" height="19">’s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 242.7h30V281h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 251.5)" width="7" height="19">”</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M355 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 242.7h30V281h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 251.5)" width="5" height="19">!</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 304h30v38H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 312.5)" width="7" height="19">“</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 304h50v38H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 312.5)" width="23" height="19">Let</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 304h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 312.5)" width="19" height="19">go</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 304h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 312.5)" width="15" height="19">to</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M270 304h61v38h-61z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 312.5)" width="26" height="19">N.Y.</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 304h40v38h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 312.5)" width="11" height="19">’s</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 304h30v38h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 312.5)" width="7" height="19">”</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 304h30v38h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 312.5)" width="5" height="19">!</text>
<rect width="104" height="19" x="437" y="72" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 74.5)" width="65" height="12">EXCEPTION</text>
<rect width="104" height="19" x="437" y="11" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 13.5)" width="43" height="12">PREFIX</text>
<rect width="104" height="19" x="437" y="130" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 132.5)" width="43" height="12">SUFFIX</text>
<rect width="104" height="19" x="437" y="191" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 193.5)" width="43" height="12">SUFFIX</text>
<rect width="104" height="19" x="437" y="252" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 254.5)" width="65" height="12">EXCEPTION</text>
<rect width="104" height="19" x="437" y="313" fill="#82b366" stroke="#82b366" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(473.5 315.5)" width="29" height="12">DONE</text>
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="598" height="386" viewBox="0 0 598 386">
<defs>
<path id="a" d="M51.3 10.9a4.3 4.3 0 01-.6-2.2c0-.6.2-1.2.5-1.9a6 6 0 011.4-1.6l.6.4.1.2-.1.3a7.5 7.5 0 00-.6 1l-.2.5a2.5 2.5 0 000 1.4l.3.8.1.2c0 .2 0 .3-.3.4l-1.2.5zm3.4 0a4.3 4.3 0 01-.7-2.2c0-.6.2-1.2.5-1.9A6 6 0 0156 5.2l.6.4h.1v.5a7.5 7.5 0 00-.7 1l-.2.5a2.5 2.5 0 000 1.4l.4.8v.2c0 .2 0 .3-.2.4l-1.2.5zm7.4 9.3H69V22h-9V6.2h2.2v14zM75 10.7a5 5 0 011.9.3 4.1 4.1 0 012.4 2.5c.2.7.3 1.4.3 2.2v.6l-.4.1h-7.5c0 .7.1 1.4.3 1.9.2.5.4 1 .7 1.3l1.1.8 1.5.2c.5 0 .9 0 1.2-.2a6 6 0 001.6-.7l.5-.2.3.2.6.7-.9.8-1 .5a6.9 6.9 0 01-4.6 0c-.7-.2-1.2-.6-1.7-1-.5-.6-.8-1.2-1.1-2a7.6 7.6 0 010-4.7 4.7 4.7 0 012.7-3c.6-.2 1.3-.3 2.1-.3zm0 1.4a3 3 0 00-2.2.8c-.5.6-.9 1.3-1 2.3h6l-.1-1.2c-.1-.4-.3-.7-.6-1-.2-.3-.5-.5-.9-.7a3 3 0 00-1.2-.2zm10.5 10c-.9 0-1.6-.2-2-.7-.5-.5-.7-1.2-.7-2v-6.9h-1.4l-.3-.1-.1-.3v-.8l1.8-.2.5-3.5.1-.3h1.3V11H88v1.4h-3.2v6.7c0 .5.1.8.4 1 .2.3.5.4.8.4l.6-.1a2.3 2.3 0 00.6-.4h.2l.3.1.6 1c-.3.3-.8.5-1.2.7l-1.5.3zm6.2-16.7a4.1 4.1 0 01.6 2.1 4 4 0 01-.5 2 6 6 0 01-1.3 1.6l-.6-.4-.2-.1v-.1l.1-.3a5.1 5.1 0 00.7-1l.2-.5a2.5 2.5 0 000-1.4l-.4-.8v-.2c0-.2 0-.3.2-.4l1.2-.5zm9.7 7.3c-.1.2-.3.2-.4.2h-.4a8.6 8.6 0 00-1.2-.6 3.4 3.4 0 00-2 0l-.6.3-.4.5-.2.7c0 .2.1.5.3.7l.6.5 1 .3a49.6 49.6 0 013 1.3l.7.8.2 1.2c0 .5 0 1-.3 1.4-.2.5-.4.8-.8 1.1a4 4 0 01-1.3.8c-.5.2-1.1.3-1.8.3a5.6 5.6 0 01-3.7-1.4l.4-.7.2-.2.4-.1.4.1.5.4.8.3 1 .2c.5 0 .8 0 1-.2.4 0 .6-.2.8-.4l.4-.6.2-.7c0-.3-.1-.5-.3-.7a2 2 0 00-.6-.6l-1-.3a68.4 68.4 0 01-2.1-.8l-1-.5-.6-.9c-.2-.3-.2-.7-.2-1.2a3 3 0 011-2.2c.3-.3.7-.6 1.2-.8a5 5 0 011.7-.2c.8 0 1.4.1 2 .3l1.5 1-.4.7z"/>
<path id="b" d="M183.5 10.7l1.4.1 1.1.5h3v.7c0 .3 0 .4-.4.5l-1.3.2c.3.4.4 1 .4 1.6a3.4 3.4 0 01-1.2 2.6c-.3.3-.8.5-1.3.7a5.6 5.6 0 01-3.2 0l-.5.5-.2.5c0 .3.1.5.4.6.2.2.5.3.8.3l1.2.1a36.1 36.1 0 012.7.3c.5 0 .9.2 1.2.4.4.2.7.4.9.7a3 3 0 010 2.6c-.3.5-.6 1-1 1.3-.5.3-1 .6-1.7.8a8.4 8.4 0 01-4.3 0 5 5 0 01-1.6-.6c-.4-.2-.7-.6-.9-1-.2-.3-.3-.6-.3-1 0-.6.2-1 .5-1.4.4-.4.9-.7 1.5-1a2 2 0 01-.8-.5c-.2-.3-.3-.6-.3-1l.1-.5c0-.2.2-.4.3-.5a2.9 2.9 0 011-1c-.5-.2-1-.6-1.2-1.2-.3-.5-.5-1-.5-1.7a3.3 3.3 0 011.2-2.6 4 4 0 011.3-.8l1.7-.2zm3.5 12c0-.4 0-.6-.2-.8l-.7-.4-.9-.2a13.9 13.9 0 00-2.2-.1l-1.2-.1-1 .7a1.5 1.5 0 00-.2 1.7l.6.6c.3.1.6.3 1 .3l1.4.2c.6 0 1 0 1.4-.2.5 0 .8-.2 1.1-.4.3-.1.5-.3.7-.6l.2-.8zm-3.5-6.1l1-.2c.4-.1.6-.3.8-.5l.5-.7.2-.9c0-.7-.2-1.2-.7-1.6-.4-.4-1-.6-1.8-.6s-1.4.2-1.8.6c-.4.4-.6 1-.6 1.6 0 .3 0 .6.2 1a2 2 0 001.2 1c.3.2.6.3 1 .3zm12-6c.8 0 1.6.2 2.2.5a4.7 4.7 0 012.8 3c.2.7.3 1.5.3 2.3 0 .9 0 1.7-.3 2.4s-.6 1.3-1 1.8c-.6.5-1.1.9-1.8 1.2-.6.2-1.4.4-2.2.4-.8 0-1.5-.2-2.2-.4-.6-.3-1.2-.7-1.7-1.2-.4-.5-.8-1.1-1-1.8a7 7 0 01-.4-2.4c0-.8.1-1.6.4-2.3.2-.8.6-1.4 1-1.9.5-.5 1-.8 1.7-1.1.7-.3 1.4-.4 2.2-.4zm0 10c1.1 0 2-.3 2.5-1 .5-.8.8-1.8.8-3.2 0-1.3-.3-2.3-.8-3-.5-.8-1.4-1.2-2.5-1.2-.5 0-1 .1-1.4.3-.4.2-.8.5-1 .8l-.7 1.4-.2 1.7.2 1.8.6 1.3c.3.4.7.6 1 .8l1.5.3z"/>
<path id="c" d="M250.4 22.2c-.8 0-1.5-.3-2-.8s-.7-1.2-.7-2v-6.9h-1.3l-.3-.1-.2-.3v-.8l1.9-.2.4-3.5c0-.1 0-.2.2-.3h1.3V11h3.2v1.4h-3.2v6.7c0 .5 0 .8.3 1 .2.3.5.4.9.4l.5-.1a2.3 2.3 0 00.7-.4h.2l.3.1.5 1a4.1 4.1 0 01-2.7 1zm9.4-11.5c.8 0 1.5.1 2.2.4a4.7 4.7 0 012.7 3 7.2 7.2 0 010 4.7c-.2.7-.6 1.3-1 1.8-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-.8 0-1.6-.2-2.2-.4-.7-.3-1.2-.7-1.7-1.2s-.8-1.1-1-1.8a7 7 0 01-.4-2.4c0-.8 0-1.6.3-2.3a4.7 4.7 0 012.7-3c.7-.3 1.5-.4 2.3-.4zm0 10c1 0 2-.4 2.4-1.2.6-.7.9-1.7.9-3 0-1.4-.3-2.4-.9-3.2-.5-.7-1.3-1-2.4-1-.6 0-1 0-1.5.2l-1 .8c-.3.4-.5.8-.6 1.4l-.2 1.7c0 .7 0 1.3.2 1.8.1.5.3 1 .6 1.3.3.4.6.6 1 .8.4.2 1 .3 1.5.3z"/>
<path id="d" d="M347.6 6.2l.5.1.3.3 9.1 11.9a7.5 7.5 0 010-1.1V6.2h1.8V22h-1a1 1 0 01-.5 0 1 1 0 01-.3-.4l-9.1-11.9a14.1 14.1 0 010 1V22h-1.9V6.2h1.1zm14.6 14.6a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1c.2 0 .4.2.5.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.4-1zm10-5V22h-2v-6.3l-5.9-9.5h2c.1 0 .3 0 .4.2l.3.3 3.6 6.2a7.6 7.6 0 01.6 1.4 13 13 0 01.6-1.4l3.6-6.2.3-.3.4-.2h2l-5.9 9.5zm5.2 5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1c.2 0 .3.2.4.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1zm8.4-14.6v6.3a27.8 27.8 0 01-.2 4h-1.4a66.4 66.4 0 01-.2-4V6.2h1.8zm-2.3 14.6a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1c.2 0 .3.2.4.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1zm8.1-15.4a4.1 4.1 0 01.6 2.1 4 4 0 01-.5 2 6 6 0 01-1.3 1.6l-.6-.4-.2-.1v-.1l.1-.3a5.1 5.1 0 00.7-1l.2-.5a2.5 2.5 0 000-1.4l-.4-.8v-.2c0-.2 0-.3.2-.4l1.2-.5zm3.4 0a4.1 4.1 0 01.6 2.1 4 4 0 01-.5 2 6 6 0 01-1.4 1.6l-.6-.4-.1-.1v-.1-.3a5.1 5.1 0 00.7-1l.2-.5a2.5 2.5 0 000-1.4l-.4-.8v-.2c0-.2 0-.3.3-.4l1.2-.5z"/>
<path id="e" d="M13.5 77.8c-.5-.8-.8-1.6-.8-2.5 0-.7.2-1.4.6-2 .3-.7.9-1.4 1.6-2l.8.6.2.2V72.4l-.1.2a5.7 5.7 0 00-.6.8l-.2.6-.1.7.1.8c0 .3.2.5.4.9l.1.3c0 .2-.1.4-.4.5l-1.6.6zm3.6 0c-.5-.8-.7-1.6-.7-2.5 0-.7.2-1.4.5-2 .4-.7 1-1.4 1.6-2l.9.6.1.2v.5a5.7 5.7 0 00-.7.8l-.2.6v1.5l.5.9v.3c0 .2 0 .4-.3.5l-1.7.6z"/>
<path id="f" d="M80.8 86.8h6.8v1.7h-9V72.8h2.2v14zm12.9-9.6a5 5 0 011.8.4A4.1 4.1 0 0198 80c.2.6.3 1.3.3 2.1l-.1.6-.4.2h-7.4c0 .7.1 1.3.3 1.8.2.5.4 1 .7 1.3.3.4.7.6 1.1.8l1.5.3 1.2-.2a6 6 0 001.6-.7l.4-.2c.2 0 .3 0 .4.2l.6.7-1 .7c-.2.3-.6.4-1 .6a6.9 6.9 0 01-4.5 0c-.7-.3-1.2-.6-1.7-1.1-.5-.5-.9-1.2-1.1-1.9a7.6 7.6 0 010-4.7c.2-.7.5-1.3 1-1.8.4-.5 1-.9 1.6-1.2.6-.2 1.4-.4 2.2-.4zm0 1.5a3 3 0 00-2.2.8c-.5.5-.9 1.3-1 2.3h6l-.1-1.3-.6-1c-.2-.2-.5-.5-.9-.6a3 3 0 00-1.2-.2zm10.5 10c-.9 0-1.6-.2-2-.7-.5-.5-.8-1.2-.8-2.1V79h-1.6l-.1-.4v-.8l1.8-.2.5-3.4.1-.3.3-.1h1v3.8h3.2V79h-3.2v6.7c0 .5.1.8.3 1 .3.3.6.4 1 .4h.4a2.3 2.3 0 00.7-.4l.2-.1c.1 0 .2 0 .3.2l.6 1-1.3.7-1.4.2zm6.2-16.7a4.1 4.1 0 01.6 2 4 4 0 01-.5 2 6 6 0 01-1.4 1.6l-.6-.3V77h-.1l.1-.3a5.1 5.1 0 00.6-1l.2-.6a2.5 2.5 0 000-1.3c0-.3-.2-.6-.3-.8l-.1-.3c0-.1 0-.3.3-.4l1.2-.4zm9.6 7.2c0 .2-.2.3-.4.3l-.3-.1a8.6 8.6 0 00-1.3-.6 3.4 3.4 0 00-1.8 0l-.7.4-.5.5-.1.6c0 .3 0 .5.2.7l.7.5 1 .4a49.6 49.6 0 013 1.3c.2.2.5.5.6.8.2.3.3.7.3 1.1l-.3 1.5c-.2.4-.5.8-.8 1a4 4 0 01-1.3.8l-1.8.3a5.6 5.6 0 01-3.8-1.3l.5-.8.2-.2h.8l.5.4.7.4 1.2.1 1-.1.7-.4.4-.6.1-.7c0-.3 0-.6-.2-.8a2 2 0 00-.7-.5l-.9-.4a68.4 68.4 0 01-2.1-.7l-1-.6-.6-.8-.3-1.2a3 3 0 011-2.3l1.3-.7a5 5 0 011.7-.3c.7 0 1.4.1 2 .4.6.2 1 .5 1.5 1l-.5.6z"/>
<path id="g" d="M182.2 77.2c.4 0 .9 0 1.3.2.4 0 .8.2 1.2.4h3v.8c0 .2-.2.4-.5.4l-1.2.2c.2.5.3 1 .3 1.6a3.4 3.4 0 01-1.1 2.6c-.4.3-.8.6-1.4.7-.5.2-1 .3-1.6.3-.6 0-1 0-1.5-.2l-.5.5c-.2.2-.2.3-.2.5s0 .4.3.6l.8.3h1.2a36.1 36.1 0 012.8.3l1.2.4.8.8c.2.3.3.7.3 1.2s0 1-.3 1.4c-.3.5-.6.9-1 1.2a7 7 0 01-3.8 1.1c-.9 0-1.6 0-2.2-.2a5 5 0 01-1.5-.6l-1-1-.2-1c0-.6.1-1.1.5-1.5.3-.4.8-.7 1.4-1a2 2 0 01-.7-.5c-.2-.2-.3-.6-.3-1v-.5l.3-.5a2.9 2.9 0 011.1-.9c-.5-.3-1-.7-1.3-1.2-.3-.5-.5-1.1-.5-1.8 0-.5.1-1 .4-1.5l.8-1.1a4 4 0 011.4-.7c.5-.2 1-.3 1.7-.3zm3.4 12c0-.3 0-.6-.2-.7-.1-.2-.4-.3-.6-.4l-1-.2a13.9 13.9 0 00-2.2-.2h-1.1c-.4.1-.8.4-1 .6a1.5 1.5 0 00-.2 1.8c.1.2.3.4.6.5.2.2.6.3 1 .4l1.4.1 1.4-.1c.4-.1.8-.2 1-.4l.7-.6c.2-.3.2-.6.2-.8zm-3.4-6.1c.4 0 .7 0 1-.2.3 0 .6-.2.8-.4l.4-.7.2-1c0-.6-.2-1.2-.6-1.6-.4-.4-1-.6-1.8-.6s-1.4.2-1.8.6a2.5 2.5 0 00-.5 2.5 2 2 0 001.2 1.2l1 .2zm12-5.9c.8 0 1.5.2 2.2.4a4.7 4.7 0 012.7 3c.2.7.4 1.5.4 2.4 0 .8-.2 1.6-.4 2.3-.2.7-.6 1.3-1 1.8-.5.5-1 1-1.7 1.2-.7.3-1.4.4-2.2.4-.8 0-1.6-.1-2.2-.4-.7-.3-1.3-.7-1.7-1.2-.5-.5-.8-1-1-1.8a7 7 0 01-.5-2.3c0-.9.2-1.7.4-2.4.3-.7.6-1.3 1-1.8.5-.5 1.1-.9 1.8-1.2.6-.2 1.4-.4 2.2-.4zm0 10c1 0 1.9-.4 2.4-1.1.6-.8.8-1.8.8-3.1s-.2-2.4-.8-3.1c-.5-.8-1.3-1.1-2.4-1.1-.6 0-1 0-1.5.3-.4.1-.7.4-1 .8-.3.3-.5.8-.6 1.3-.2.5-.2 1.1-.2 1.8 0 .6 0 1.2.2 1.8.1.5.3 1 .6 1.3l1 .8c.4.2 1 .3 1.5.3z"/>
<path id="h" d="M249 88.7c-1 0-1.6-.2-2-.7-.6-.5-.8-1.2-.8-2.1V79h-1.6l-.2-.4v-.8l1.9-.2.4-3.4c0-.2 0-.2.2-.3l.3-.1h1v3.8h3.2V79h-3.2v6.7c0 .5 0 .8.3 1 .2.3.5.4.9.4h.5a2.3 2.3 0 00.7-.4l.2-.1c.1 0 .2 0 .3.2l.5 1-1.2.7-1.5.2zm9.3-11.5c.8 0 1.5.2 2.2.4a4.7 4.7 0 012.7 3c.3.7.4 1.5.4 2.4 0 .8-.1 1.6-.4 2.3-.2.7-.6 1.3-1 1.8-.5.5-1 1-1.7 1.2-.7.3-1.4.4-2.2.4-.8 0-1.6-.1-2.2-.4-.7-.3-1.2-.7-1.7-1.2s-.8-1-1-1.8a7 7 0 01-.4-2.3c0-.9 0-1.7.3-2.4s.6-1.3 1.1-1.8c.5-.5 1-.9 1.7-1.2.6-.2 1.4-.4 2.2-.4zm0 10c1 0 2-.4 2.4-1.1.6-.8.9-1.8.9-3.1s-.3-2.4-.9-3.1c-.5-.8-1.3-1.1-2.4-1.1-.6 0-1 0-1.5.3-.4.1-.7.4-1 .8-.3.3-.5.8-.6 1.3L255 83c0 .6 0 1.2.2 1.8.1.5.3 1 .6 1.3l1 .8c.4.2 1 .3 1.5.3z"/>
<path id="i" d="M347.2 72.8h.5l.3.3 9.1 12a7.5 7.5 0 010-1.2V72.8h1.8v15.7h-1a1 1 0 01-.5 0 1 1 0 01-.3-.3l-9.1-12a14.1 14.1 0 010 1.1v11.2h-1.9V72.8h1.1zm14.6 14.5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4c.2 0 .4 0 .5.2.2 0 .3.1.5.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .3 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.4-1zm10-5v6.2h-2v-6.2l-5.9-9.5h2l.4.1.2.4 3.7 6.1a7.6 7.6 0 01.6 1.4 13 13 0 01.6-1.4l3.6-6.1.3-.4.4-.1h2l-5.9 9.5zm5.2 5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4c.1 0 .3 0 .5.2.2 0 .3.1.4.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .3 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1zm8.4-14.5V79a27.8 27.8 0 01-.3 4h-1.3a66.4 66.4 0 01-.3-4v-6.3h2zm-2.3 14.5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4c.1 0 .3 0 .5.2l.4.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .3 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1zm8.1-15.3a4.1 4.1 0 01.6 2 4 4 0 01-.5 2 6 6 0 01-1.3 1.6l-.6-.3-.2-.2.1-.3a5.1 5.1 0 00.7-1l.2-.6a2.5 2.5 0 000-1.3l-.4-.8v-.3c0-.1 0-.3.2-.4l1.2-.4zm3.3 0a4.1 4.1 0 01.7 2 4 4 0 01-.5 2 6 6 0 01-1.4 1.6l-.6-.3-.1-.2v-.3a5.1 5.1 0 00.7-1l.2-.6a2.5 2.5 0 000-1.3c0-.3-.2-.6-.4-.8v-.3c0-.1 0-.3.2-.4l1.2-.4z"/>
<path id="j" d="M13.5 141c-.5-.7-.8-1.6-.8-2.4 0-.7.2-1.4.6-2.1.3-.7.9-1.3 1.6-1.8l.8.5.2.1v.4l-.1.2a5.7 5.7 0 00-.6.8l-.2.6-.1.6.1.8c0 .3.2.6.4 1l.1.2c0 .3-.1.4-.4.5l-1.6.7zm3.6 0c-.5-.7-.7-1.6-.7-2.4 0-.7.2-1.4.5-2.1.4-.7 1-1.3 1.6-1.8l.9.5.1.1V136a5.7 5.7 0 00-.7.8l-.2.6v1.4l.5 1v.2c0 .3 0 .4-.3.5l-1.7.7z"/>
<path id="k" d="M60 149.4h6.3v2.4h-9.4V136h3v13.5zm12.4-9c.7 0 1.4 0 2 .3a4.3 4.3 0 012.5 2.6 6 6 0 01.4 2.7l-.1.3-.2.2h-7.3c0 1.2.4 2 1 2.6a3 3 0 002 .8c.5 0 1 0 1.2-.2l.9-.3.6-.4.5-.1h.3l.2.2.8 1-1 1a5.7 5.7 0 01-2.4.8h-1.3a6 6 0 01-2.1-.3c-.7-.3-1.3-.7-1.8-1.2s-.9-1.1-1.2-1.9a7.3 7.3 0 010-4.7c.2-.7.6-1.3 1-1.8a5 5 0 011.7-1.2c.7-.3 1.5-.4 2.3-.4zm0 1.9c-.7 0-1.3.2-1.8.7-.4.4-.7 1-.8 1.9h5v-1l-.5-.8a2 2 0 00-.8-.6l-1-.2zm10.8 9.7c-1 0-1.7-.3-2.2-.8-.6-.6-.8-1.4-.8-2.3v-6.3H79c-.1 0-.3 0-.4-.2l-.1-.4v-1l1.8-.4.6-3c0-.2 0-.3.2-.4l.4-.1h1.4v3.5h3v2h-3v6c0 .4 0 .7.2 1l.8.2h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.3-.9.6-1.4.7a5 5 0 01-1.6.3z"/>
<path id="l" d="M183 140.3c.4 0 .9 0 1.3.2.4 0 .8.2 1.2.4h3.2v1l-.1.4c0 .1-.2.2-.5.2l-1 .2a3.5 3.5 0 01.3 1.3 3.4 3.4 0 01-1.3 2.6c-.4.4-.9.6-1.4.8a5.7 5.7 0 01-3 .1c-.3.2-.5.5-.5.7 0 .3 0 .4.3.5l.8.3h1.2a24.2 24.2 0 012.6.3l1.2.4.8.8c.2.4.3.8.3 1.4 0 .5 0 1-.3 1.4l-1 1.3-1.8.9a8.9 8.9 0 01-4.5 0c-.7-.1-1.2-.3-1.6-.6l-1-1-.2-1c0-.6.1-1 .4-1.4.4-.4.8-.7 1.4-.9-.3-.1-.5-.3-.7-.6-.2-.3-.2-.6-.2-1v-.5l.3-.6.5-.5.6-.4a3.3 3.3 0 01-1.8-3 3.4 3.4 0 011.2-2.7c.4-.3 1-.5 1.5-.7a6 6 0 011.8-.3zm3 12c0-.2-.1-.4-.3-.5 0-.2-.3-.3-.5-.3a14.7 14.7 0 00-2.8-.3l-1-.1c-.4.1-.6.3-.8.6-.2.2-.3.5-.3.8 0 .2 0 .3.2.5 0 .2.2.3.4.5l.9.2 1.2.2c.5 0 1 0 1.3-.2.4 0 .7-.1 1-.3l.5-.5.1-.6zm-3-6.4l.8-.1.7-.4.3-.6.2-.7c0-.6-.2-1-.5-1.4-.4-.3-.9-.5-1.5-.5-.7 0-1.2.2-1.5.5-.4.4-.5.8-.5 1.4v.7a1.6 1.6 0 001 1l1 .1zm12.3-5.5c.8 0 1.6 0 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.4 4.9 4.9 0 01-2.9 3 6.2 6.2 0 01-4.6 0 5 5 0 01-2.8-3c-.3-.7-.4-1.6-.4-2.4 0-1 0-1.7.4-2.5.2-.7.6-1.3 1-1.8a5 5 0 011.9-1.1c.6-.3 1.4-.4 2.3-.4zm0 9.5c.9 0 1.6-.3 2-1 .5-.6.7-1.5.7-2.7 0-1.2-.2-2.2-.7-2.8-.4-.6-1.1-1-2-1-1 0-1.7.4-2.2 1-.4.6-.6 1.6-.6 2.8 0 1.2.2 2.1.6 2.7.5.7 1.2 1 2.2 1z"/>
<path id="m" d="M251.3 152c-1 0-1.7-.3-2.2-.8-.6-.6-.8-1.4-.8-2.3v-6.3H247c-.1 0-.2 0-.3-.2l-.2-.4v-1l1.8-.4.6-3c0-.2 0-.3.2-.4l.4-.1h1.4v3.5h3v2h-3v6c0 .4 0 .7.3 1l.7.2h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.3-.9.6-1.4.7a5 5 0 01-1.6.3zm9.7-11.6c.8 0 1.6 0 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.4 4.9 4.9 0 01-2.9 3 6.2 6.2 0 01-4.6 0 5 5 0 01-2.8-3c-.3-.7-.4-1.6-.4-2.4 0-1 0-1.7.4-2.5.2-.7.6-1.3 1-1.8a5 5 0 011.9-1.1c.6-.3 1.4-.4 2.3-.4zm0 9.5c.9 0 1.6-.3 2-1 .5-.6.7-1.5.7-2.7 0-1.2-.2-2.2-.7-2.8-.4-.6-1.1-1-2-1-1 0-1.7.4-2.2 1-.4.6-.6 1.6-.6 2.8 0 1.2.2 2.1.6 2.7.5.7 1.2 1 2.2 1z"/>
<path id="n" d="M347.7 136l.5.1.3.3 9.1 11.9a7.5 7.5 0 010-1V136h1.8v15.7h-1a1 1 0 01-.5 0 1 1 0 01-.3-.4l-9.1-11.8a14.1 14.1 0 010 1v11.2h-1.9v-15.7h1.1zm14.6 14.6a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1.5.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.4-1zm10-5v6.2h-2v-6.3l-5.9-9.4h2l.4.1.2.4 3.7 6a7.6 7.6 0 01.6 1.5 13 13 0 01.6-1.4l3.6-6.1c0-.2.2-.3.3-.4l.4-.1h2l-5.9 9.4zm5.2 5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1.4.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1zm8.4-14.5v6.2a27.8 27.8 0 01-.3 4h-1.3a66.4 66.4 0 01-.3-4v-6.2h2zm-2.3 14.5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1.4.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1zm8.1-15.4a4.1 4.1 0 01.6 2.1 4 4 0 01-.5 2 6 6 0 01-1.3 1.6l-.6-.4h-.2v-.2l.1-.3a5.1 5.1 0 00.7-1l.2-.5a2.5 2.5 0 000-1.4l-.4-.8v-.2c0-.2 0-.3.2-.4l1.2-.5zm3.3 0a4.1 4.1 0 01.7 2.1 4 4 0 01-.5 2 6 6 0 01-1.4 1.6l-.6-.4h-.1v-.2-.3a5.1 5.1 0 00.7-1l.2-.5a2.5 2.5 0 000-1.4l-.4-.8v-.2c0-.2 0-.3.2-.4l1.2-.5z"/>
<path id="o" d="M127 135l.6 1.2a4.3 4.3 0 01-.4 3.3c-.4.7-.9 1.3-1.6 1.9l-.8-.5-.2-.2v-.2l.1-.3a6.5 6.5 0 00.6-.9 2.9 2.9 0 00.3-1.2l-.1-.8c0-.3-.2-.6-.4-.9l-.1-.3c0-.2.1-.4.4-.5l1.6-.6zm9.7 7.7l-.2.3h-.3a29 29 0 00-1.6-.6h-1a2 2 0 00-1.2.3 1 1 0 00-.5.9l.3.6.6.4.9.3a29 29 0 012 .8l.9.5.6.9c.2.3.3.7.3 1.1 0 .6-.1 1-.3 1.5-.2.5-.5.9-.9 1.2a4 4 0 01-1.4.8 6.1 6.1 0 01-3 .2 6.7 6.7 0 01-2.1-.7l-.8-.6.7-1c0-.2.1-.2.3-.3l.4-.1.4.1a10.6 10.6 0 001.3.6l1 .2.8-.1.6-.3.3-.5.1-.5c0-.2 0-.5-.2-.6a2 2 0 00-.6-.5 29.4 29.4 0 01-3-1l-.9-.6-.6-1c-.2-.3-.2-.7-.2-1.2a3.3 3.3 0 011-2.4 4 4 0 011.4-.8c.5-.2 1.1-.2 1.8-.2a4.8 4.8 0 013.7 1.4l-.6 1z"/>
<path id="p" d="M13.5 208.7c-.5-.8-.8-1.6-.8-2.5 0-.7.2-1.4.6-2 .3-.7.9-1.3 1.6-2l.8.6.2.2V203.3l-.1.2a5.7 5.7 0 00-.6.9l-.2.5-.1.7.1.8c0 .3.2.6.4.9l.1.3c0 .2-.1.4-.4.5l-1.6.6zm3.6 0c-.5-.8-.7-1.6-.7-2.5 0-.7.2-1.4.5-2 .4-.7 1-1.3 1.6-2l.9.6.1.2v.5a5.7 5.7 0 00-.7.9l-.2.5v1.5l.5.9v.3c0 .2 0 .4-.3.5l-1.7.6z"/>
<path id="q" d="M60 217h6.3v2.5h-9.4v-16h3V217zm12.4-9c.7 0 1.4.1 2 .3a4.3 4.3 0 012.5 2.6 6 6 0 01.4 2.7l-.1.3-.2.2h-7.3c0 1.2.4 2 1 2.6a3 3 0 002 .8l1.2-.1.9-.4.6-.3.5-.2h.3l.2.3.8 1-1 .9a5.7 5.7 0 01-2.4.8l-1.3.1a6 6 0 01-2.1-.4c-.7-.2-1.3-.6-1.8-1.1-.5-.5-.9-1.2-1.2-2a7.3 7.3 0 010-4.7c.2-.7.6-1.3 1-1.8a5 5 0 011.7-1.2c.7-.3 1.5-.4 2.3-.4zm0 2c-.7 0-1.3.2-1.8.6-.4.5-.7 1-.8 2h5v-1l-.5-.9a2 2 0 00-.8-.6l-1-.2zm10.8 9.6c-1 0-1.7-.2-2.2-.8-.6-.6-.8-1.3-.8-2.3v-6.3H79l-.4-.1-.1-.5v-1l1.8-.3.6-3.1c0-.2 0-.3.2-.4H82.9v3.5h3v1.9h-3v6.1c0 .4 0 .6.2.8.2.2.5.3.8.3h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.4-.9.6-1.4.8a5 5 0 01-1.6.2z"/>
<path id="r" d="M185.5 208l1.3.1 1.2.5h3.2v1l-.1.4-.5.2-1 .1a3.5 3.5 0 01.3 1.3 3.4 3.4 0 01-1.3 2.7l-1.4.7a5.7 5.7 0 01-3 .2c-.3.2-.5.4-.5.7 0 .2 0 .4.3.5l.8.2h1.2a24.2 24.2 0 012.6.3l1.2.5c.3.2.6.4.8.8.2.3.3.8.3 1.3s0 1-.3 1.5l-1 1.2c-.6.4-1.1.7-1.8.9a8.9 8.9 0 01-4.5 0c-.7 0-1.2-.3-1.6-.6l-1-1-.2-1c0-.6.1-1 .4-1.4.4-.3.8-.6 1.4-.8l-.7-.7c-.2-.2-.2-.6-.2-1v-.5l.3-.5.5-.5.6-.4c-.6-.3-1-.8-1.3-1.3-.4-.5-.5-1-.5-1.8a3.4 3.4 0 011.2-2.6c.4-.4 1-.6 1.5-.8a6 6 0 011.8-.2zm3 12c0-.3-.1-.4-.3-.6l-.5-.3a14.7 14.7 0 00-2.8-.3l-1-.1-.8.6c-.2.2-.3.5-.3.8 0 .2 0 .4.2.5 0 .2.2.4.4.5l.9.3h2.5l1-.3.5-.5.1-.6zm-3-6.5l.8-.1c.3 0 .5-.2.7-.4l.3-.6.2-.7c0-.6-.2-1-.5-1.3-.4-.4-.9-.5-1.5-.5-.7 0-1.2.1-1.5.5-.4.3-.5.7-.5 1.3v.7a1.6 1.6 0 001 1l1 .1zm12.3-5.5c.8 0 1.6.1 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.4 4.9 4.9 0 01-2.9 3c-.6.3-1.4.4-2.2.4-.9 0-1.7-.1-2.3-.4a5 5 0 01-3-3c-.2-.7-.3-1.5-.3-2.4 0-.9 0-1.7.4-2.4.2-.7.6-1.3 1-1.8a5 5 0 011.9-1.2c.6-.3 1.4-.4 2.3-.4zm0 9.5c.9 0 1.6-.3 2-1 .5-.5.7-1.5.7-2.7 0-1.2-.2-2.1-.7-2.8-.4-.6-1.1-1-2-1-1 0-1.7.4-2.2 1-.4.7-.6 1.6-.6 2.8 0 1.2.2 2.1.6 2.8.5.6 1.2 1 2.2 1z"/>
<path id="s" d="M252.8 219.6c-1 0-1.7-.2-2.2-.8-.6-.6-.8-1.3-.8-2.3v-6.3h-1.2l-.3-.1-.2-.5v-1l1.8-.3.6-3.1c0-.2 0-.3.2-.4H252.5v3.5h3v1.9h-3v6.1c0 .4 0 .6.3.8.1.2.4.3.7.3h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.4-.9.6-1.4.8a5 5 0 01-1.6.2zm9.7-11.6c.8 0 1.6.1 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.4 4.9 4.9 0 01-2.9 3c-.6.3-1.4.4-2.2.4-.9 0-1.7-.1-2.3-.4a5 5 0 01-3-3c-.2-.7-.3-1.5-.3-2.4 0-.9 0-1.7.4-2.4.2-.7.6-1.3 1-1.8a5 5 0 011.9-1.2c.6-.3 1.4-.4 2.3-.4zm0 9.5c.9 0 1.6-.3 2-1 .5-.5.7-1.5.7-2.7 0-1.2-.2-2.1-.7-2.8-.4-.6-1.1-1-2-1-1 0-1.7.4-2.2 1-.4.7-.6 1.6-.6 2.8 0 1.2.2 2.1.6 2.8.5.6 1.2 1 2.2 1z"/>
<path id="t" d="M331.8 203.7h.4l.3.4 9.2 11.8a7.5 7.5 0 01-.1-1v-11.2h1.9v15.8h-1.1a1 1 0 01-.4-.1 1 1 0 01-.4-.3l-9-11.9a14.1 14.1 0 010 1v11.3h-2v-15.8h1.2zm14.6 14.5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.3h.5l.4.4a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .3 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1zm10-5v6.3h-2.1v-6.3l-5.8-9.5h1.9l.4.1.3.4 3.6 6.1a7.6 7.6 0 01.6 1.4 13 13 0 01.7-1.4l3.6-6.1.2-.3c.1-.2.3-.2.5-.2h1.9l-5.8 9.5zm5.1 5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.3h.5l.5.4a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .3 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.4-1zm8.5-14.5v6.3a27.8 27.8 0 01-.3 4h-1.3a66.4 66.4 0 01-.3-4v-6.3h1.9zm-2.4 14.5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.3h.5l.5.4a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .3 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.4-1z"/>
<path id="u" d="M127 202.6l.6 1.2a4.3 4.3 0 01-.4 3.4c-.4.6-.9 1.3-1.6 1.8l-.8-.5-.2-.2v-.1l.1-.4a6.5 6.5 0 00.6-.8 2.9 2.9 0 00.3-1.3l-.1-.8c0-.3-.2-.6-.4-.9l-.1-.3c0-.2.1-.4.4-.5l1.6-.6zm9.7 7.8l-.2.2h-.3a29 29 0 00-1.6-.6h-1a2 2 0 00-1.2.3 1 1 0 00-.5.9c0 .2.1.5.3.6.1.2.3.3.6.4l.9.4a29 29 0 012 .7l.9.6c.3.2.5.5.6.8.2.3.3.7.3 1.2l-.3 1.5-.9 1.2a4 4 0 01-1.4.8 6.1 6.1 0 01-3 .1 6.7 6.7 0 01-2.1-.7l-.8-.6.7-1 .3-.3h.8a10.6 10.6 0 001.3.7l1 .1h.8l.6-.4.3-.4.1-.5c0-.3 0-.5-.2-.7a2 2 0 00-.6-.4 29.4 29.4 0 01-3-1l-.9-.7-.6-.9c-.2-.3-.2-.8-.2-1.3a3.3 3.3 0 011-2.4 4 4 0 011.4-.7 5.6 5.6 0 014 .1c.6.2 1.1.6 1.5 1l-.6 1z"/>
<path id="v" d="M429.6 202.6l.5 1.2a4.3 4.3 0 01-.3 3.4c-.4.6-1 1.3-1.6 1.8l-.9-.5-.1-.2v-.1-.4a7.8 7.8 0 00.7-.8l.2-.6a2.5 2.5 0 000-1.5l-.5-.9v-.3c0-.2 0-.4.3-.5l1.7-.6zm3.6 0l.6 1.2a4.3 4.3 0 01-.4 3.4c-.3.6-.9 1.3-1.6 1.8l-.8-.5-.2-.2v-.1l.1-.4a7.8 7.8 0 00.6-.8l.2-.6a2.5 2.5 0 000-1.5c0-.3-.2-.6-.4-.9l-.1-.3c0-.2.1-.4.4-.5l1.6-.6z"/>
<path id="w" d="M13.5 275.3c-.5-.9-.8-1.7-.8-2.5 0-.7.2-1.4.6-2.1.3-.7.9-1.3 1.6-1.9l.8.6.2.1v.4l-.1.1a5.7 5.7 0 00-.6.9l-.2.6-.1.6.1.8c0 .3.2.6.4 1l.1.2c0 .3-.1.4-.4.5l-1.6.7zm3.6 0c-.5-.9-.7-1.7-.7-2.5 0-.7.2-1.4.5-2.1.4-.7 1-1.3 1.6-1.9l.9.6.1.1v.5a5.7 5.7 0 00-.7.9l-.2.6v1.4l.5 1v.2c0 .3 0 .4-.3.5l-1.7.7z"/>
<path id="x" d="M58.4 283.6h6.4v2.4h-9.4v-16h3v13.6zm12.5-9c.7 0 1.4 0 2 .3a4.3 4.3 0 012.5 2.6 6 6 0 01.4 2.7l-.1.3-.2.1-.3.1h-7c0 1.2.4 2 1 2.6a3 3 0 002 .8c.5 0 1 0 1.2-.2l.9-.3.6-.4.5-.1h.3l.2.2.8 1c-.3.4-.6.7-1 .9a5.7 5.7 0 01-2.4.9H71a6 6 0 01-2.1-.3c-.7-.3-1.3-.7-1.8-1.2s-.9-1.1-1.2-1.9a7.3 7.3 0 010-4.8c.2-.6.6-1.2 1-1.7a5 5 0 011.7-1.2c.7-.3 1.5-.5 2.3-.5zm0 1.9c-.7 0-1.3.2-1.8.7-.4.4-.7 1-.8 1.9h5v-1l-.5-.9a2 2 0 00-.8-.5l-1-.2zm10.8 9.7c-1 0-1.7-.3-2.2-.9-.6-.5-.8-1.3-.8-2.2v-6.4H77l-.1-.5V275l1.8-.3.6-3c0-.2 0-.3.2-.4l.4-.1h1.4v3.5h3v2h-3v6c0 .4 0 .7.2.9.2.2.5.3.8.3h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.3-.9.6-1.4.7a5 5 0 01-1.6.3z"/>
<path id="y" d="M183 274.5c.4 0 .9 0 1.3.2.4 0 .8.2 1.2.4h3.2v1l-.1.4c0 .1-.2.2-.5.2l-1 .2a3.5 3.5 0 01.3 1.3 3.4 3.4 0 01-1.3 2.6l-1.4.8a5.7 5.7 0 01-3 .1c-.3.2-.5.4-.5.7 0 .2 0 .4.3.5l.8.2 1.2.1a24.2 24.2 0 012.6.3c.5 0 .9.2 1.2.4l.8.8c.2.4.3.8.3 1.3s0 1-.3 1.5l-1 1.3c-.6.3-1.1.6-1.8.8-.7.3-1.4.4-2.3.4-.9 0-1.6-.1-2.2-.3-.7-.1-1.2-.4-1.6-.6l-1-1-.2-1.1c0-.5.1-1 .4-1.3.4-.4.8-.7 1.4-.9l-.7-.6c-.2-.3-.2-.6-.2-1v-.5l.3-.6.5-.5.6-.4a3.3 3.3 0 01-1.8-3 3.4 3.4 0 011.2-2.7 6 6 0 013.2-1zm3 12c0-.2-.1-.4-.3-.5 0-.2-.3-.3-.5-.4a14.7 14.7 0 00-2.8-.3h-1c-.4.1-.6.3-.8.5-.2.3-.3.5-.3.8 0 .2 0 .4.2.6 0 .2.2.3.4.4.2.2.5.3.9.3l1.2.1h1.3l1-.4.5-.5.1-.6zm-3-6.4c.3 0 .6 0 .8-.2.3 0 .5-.2.7-.3l.3-.6.2-.8c0-.5-.2-1-.5-1.3-.4-.3-.9-.5-1.5-.5-.7 0-1.2.2-1.5.5-.4.3-.5.8-.5 1.3v.8a1.6 1.6 0 001 1h1zm12.3-5.6c.8 0 1.6.2 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.5 4.9 4.9 0 01-2.9 3 6.2 6.2 0 01-4.6 0 5 5 0 01-2.8-3c-.3-.8-.4-1.6-.4-2.5 0-.9 0-1.7.4-2.4.2-.7.6-1.3 1-1.8a5 5 0 011.9-1.2c.6-.2 1.4-.4 2.3-.4zm0 9.6c.9 0 1.6-.3 2-1 .5-.6.7-1.5.7-2.7 0-1.3-.2-2.2-.7-2.8-.4-.7-1.1-1-2-1-1 0-1.7.3-2.2 1-.4.6-.6 1.5-.6 2.8 0 1.2.2 2 .6 2.7.5.7 1.2 1 2.2 1z"/>
<path id="z" d="M251.3 286.2c-1 0-1.7-.3-2.2-.9-.6-.5-.8-1.3-.8-2.2v-6.4h-1.5l-.2-.5V275l1.8-.3.6-3c0-.2 0-.3.2-.4l.4-.1h1.4v3.5h3v2h-3v6c0 .4 0 .7.3.9.1.2.4.3.7.3h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.3-.9.6-1.4.7a5 5 0 01-1.6.3zm9.7-11.7c.8 0 1.6.2 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.5 4.9 4.9 0 01-2.9 3 6.2 6.2 0 01-4.6 0 5 5 0 01-2.8-3c-.3-.8-.4-1.6-.4-2.5 0-.9 0-1.7.4-2.4.2-.7.6-1.3 1-1.8a5 5 0 011.9-1.2c.6-.2 1.4-.4 2.3-.4zm0 9.6c.9 0 1.6-.3 2-1 .5-.6.7-1.5.7-2.7 0-1.3-.2-2.2-.7-2.8-.4-.7-1.1-1-2-1-1 0-1.7.3-2.2 1-.4.6-.6 1.5-.6 2.8 0 1.2.2 2 .6 2.7.5.7 1.2 1 2.2 1z"/>
<path id="A" d="M310.4 270.2l.4.1.4.3 9 11.9a7.5 7.5 0 010-1.1v-11.2h2V286H321a1 1 0 01-.4 0 1 1 0 01-.3-.4l-9.2-11.9a14.1 14.1 0 010 1V286h-1.8v-15.8h1.1zm14.6 14.6a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1c.2 0 .3.2.5.3a1.3 1.3 0 01.3 1 1.4 1.4 0 01-.3 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.4-1zm10-5v6.2H333v-6.3l-5.8-9.5h1.9c.2 0 .3 0 .4.2.2 0 .2.2.3.3l3.6 6.2a7.6 7.6 0 01.7 1.4 13 13 0 01.6-1.4l3.6-6.2.3-.3.4-.2h2l-5.9 9.5zm5.2 5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1.4.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1z"/>
<path id="B" d="M126.5 269.1l.6 1.3a4.3 4.3 0 01-.4 3.3c-.4.7-.9 1.3-1.6 1.9l-.8-.6-.2-.1v-.2l.1-.3a6.5 6.5 0 00.6-.9 2.9 2.9 0 00.3-1.2l-.1-.8c0-.3-.2-.6-.4-1l-.1-.2c0-.3.1-.4.4-.5l1.6-.7zm9.7 7.8l-.2.3h-.3a29 29 0 00-1.6-.6h-1a2 2 0 00-1.2.3 1 1 0 00-.5.9l.3.6.6.4.9.3a29 29 0 012 .8l.9.5c.3.3.5.5.6.9.2.3.3.7.3 1.1 0 .6-.1 1-.3 1.5-.2.5-.5.9-.9 1.2a4 4 0 01-1.4.8 6.1 6.1 0 01-3 .2 6.7 6.7 0 01-2.1-.8l-.8-.5.7-1c0-.2.1-.3.3-.3l.4-.1.4.1a10.6 10.6 0 001.3.6l1 .2.8-.1.6-.3.3-.5.1-.5c0-.3 0-.5-.2-.6a2 2 0 00-.6-.5 29.4 29.4 0 01-3-1l-.9-.7-.6-.8c-.2-.4-.2-.8-.2-1.3a3.3 3.3 0 011-2.4 4 4 0 011.4-.8 5.6 5.6 0 014 .1c.6.3 1.1.6 1.5 1l-.6 1z"/>
<path id="C" d="M429.6 269.1l.5 1.3a4.3 4.3 0 01-.3 3.3c-.4.7-1 1.3-1.6 1.9l-.9-.6-.1-.1v-.2-.3a7.8 7.8 0 00.7-.9l.2-.6a2.5 2.5 0 000-1.4l-.5-1v-.2c0-.3 0-.4.3-.5l1.7-.7zm3.6 0l.6 1.3a4.3 4.3 0 01-.4 3.3c-.3.7-.9 1.3-1.6 1.9l-.8-.6-.2-.1v-.2l.1-.3a7.8 7.8 0 00.6-.9l.2-.6a2.5 2.5 0 000-1.4c0-.3-.2-.6-.4-1l-.1-.2c0-.3.1-.4.4-.5l1.6-.7z"/>
<path id="D" d="M387.8 270v6.4a19.2 19.2 0 01-.3 4h-1.9a41.8 41.8 0 01-.3-4V270h2.5zm-3 14.5a1.7 1.7 0 01.5-1.2 1.7 1.7 0 011.2-.5 1.6 1.6 0 011.2.5 1.7 1.7 0 01.3 1.9c0 .2-.2.3-.3.5l-.5.3-.7.2a1.7 1.7 0 01-1.2-.5l-.3-.5-.2-.7z"/>
<path id="E" d="M13.5 341.8c-.5-.8-.8-1.6-.8-2.5 0-.7.2-1.4.6-2 .3-.7.9-1.4 1.6-2l.8.6.2.2V336.4l-.1.2a5.7 5.7 0 00-.6.8l-.2.6-.1.7.1.8c0 .3.2.5.4.9l.1.3c0 .2-.1.4-.4.5l-1.6.6zm3.6 0c-.5-.8-.7-1.6-.7-2.5 0-.7.2-1.4.5-2 .4-.7 1-1.4 1.6-2l.9.6.1.2v.5a5.7 5.7 0 00-.7.8l-.2.6v1.5l.5.9v.3c0 .2 0 .4-.3.5l-1.7.6z"/>
<path id="F" d="M60 350.1h6.3v2.4h-9.4v-15.9h3v13.5zm12.4-9c.7 0 1.4.1 2 .3a4.3 4.3 0 012.5 2.6 6 6 0 01.4 2.7l-.1.3-.2.2h-7.3c0 1.2.4 2 1 2.6a3 3 0 002 .8l1.2-.1.9-.4.6-.3.5-.2h.3l.2.3.8 1-1 .8a5.7 5.7 0 01-2.4 1h-1.3a6 6 0 01-2.1-.4c-.7-.2-1.3-.6-1.8-1.1-.5-.5-.9-1.2-1.2-2a7.3 7.3 0 010-4.7c.2-.7.6-1.3 1-1.8a5 5 0 011.7-1.2c.7-.3 1.5-.4 2.3-.4zm0 2c-.7 0-1.3.2-1.8.6-.4.4-.7 1-.8 1.9h5v-1l-.5-.8a2 2 0 00-.8-.6l-1-.2zm10.8 9.6c-1 0-1.7-.3-2.2-.8-.6-.6-.8-1.3-.8-2.3v-6.3H79l-.4-.1-.1-.5v-1l1.8-.4.6-3c0-.2 0-.3.2-.4H82.9v3.5h3v1.9h-3v6.1c0 .4 0 .6.2.8.2.2.5.3.8.3h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.4-.9.6-1.4.8a5 5 0 01-1.6.2z"/>
<path id="G" d="M184 341l1.3.2 1.2.4h3.2v1l-.1.5-.5.2-1 .1a3.5 3.5 0 01.3 1.3 3.4 3.4 0 01-1.3 2.7l-1.4.7a5.7 5.7 0 01-3 .1c-.3.3-.5.5-.5.8 0 .2 0 .4.3.5l.8.2h1.2a24.2 24.2 0 012.6.3l1.2.5c.3.2.6.4.8.8.2.3.3.8.3 1.3s0 1-.3 1.4c-.3.5-.6 1-1 1.3-.6.4-1.1.7-1.8.9a8.9 8.9 0 01-4.5 0c-.7-.1-1.2-.3-1.6-.6l-1-1-.2-1c0-.6.1-1 .4-1.4.4-.4.8-.6 1.4-.9-.3-.1-.5-.3-.7-.6-.2-.2-.2-.6-.2-1v-.5l.3-.5.5-.5.6-.5a3.3 3.3 0 01-1.8-3 3.4 3.4 0 011.2-2.7c.4-.3 1-.5 1.5-.7a6 6 0 011.8-.2zm3 12c0-.2-.1-.3-.3-.5l-.5-.3a14.7 14.7 0 00-2.8-.3l-1-.1-.8.6c-.2.2-.3.5-.3.8 0 .2 0 .4.2.5 0 .2.2.4.4.5l.9.3h2.5l1-.4c.2 0 .4-.3.5-.4l.1-.6zm-3-6.4l.8-.1.7-.4.3-.6.2-.7c0-.6-.2-1-.5-1.3-.4-.4-.9-.5-1.5-.5-.7 0-1.2.1-1.5.5-.4.3-.5.7-.5 1.3v.7a1.6 1.6 0 001 1l1 .1zm12.3-5.5c.8 0 1.6.1 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.4 4.9 4.9 0 01-2.9 3c-.6.3-1.4.4-2.2.4-.9 0-1.7-.1-2.3-.4a5 5 0 01-3-3c-.2-.7-.3-1.5-.3-2.4 0-1 0-1.7.4-2.4.2-.7.6-1.4 1-1.9a5 5 0 011.9-1.1c.6-.3 1.4-.4 2.3-.4zm0 9.5c.9 0 1.6-.3 2-1 .5-.6.7-1.5.7-2.7 0-1.2-.2-2.1-.7-2.8-.4-.6-1.1-1-2-1-1 0-1.7.4-2.2 1-.4.7-.6 1.6-.6 2.8 0 1.2.2 2.1.6 2.8.5.6 1.2 1 2.2 1z"/>
<path id="H" d="M249.8 352.7c-1 0-1.7-.3-2.2-.8-.6-.6-.8-1.3-.8-2.3v-6.3h-1.2l-.3-.1-.2-.5v-1l1.8-.4.6-3c0-.2 0-.3.2-.4H249.5v3.5h3v1.9h-3v6.1c0 .4 0 .6.3.8.1.2.4.3.7.3h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.4-.9.6-1.4.8a5 5 0 01-1.6.2zm9.7-11.6c.8 0 1.6.1 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.4 4.9 4.9 0 01-2.9 3c-.6.3-1.4.4-2.2.4-.9 0-1.7-.1-2.3-.4a5 5 0 01-3-3c-.2-.7-.3-1.5-.3-2.4 0-1 0-1.7.4-2.4.2-.7.6-1.4 1-1.9a5 5 0 011.9-1.1c.6-.3 1.4-.4 2.3-.4zm0 9.5c.9 0 1.6-.3 2-1 .5-.6.7-1.5.7-2.7 0-1.2-.2-2.1-.7-2.8-.4-.6-1.1-1-2-1-1 0-1.7.4-2.2 1-.4.7-.6 1.6-.6 2.8 0 1.2.2 2.1.6 2.8.5.6 1.2 1 2.2 1z"/>
<path id="I" d="M312.3 336.6h.4l.2.1.2.2.2.2 8.4 10.6a11 11 0 010-1.4v-9.7h2.5v16h-1.5c-.2 0-.4 0-.6-.2-.2 0-.3-.2-.4-.4l-8.4-10.6a15.3 15.3 0 01.1 1.4v9.7h-2.6v-15.9h1.5zm14.3 14.4a1.7 1.7 0 01.5-1.1 1.7 1.7 0 011.2-.5 1.6 1.6 0 011.2.5 1.7 1.7 0 01.3 1.8c0 .2-.2.4-.3.5l-.6.4-.6.1a1.7 1.7 0 01-1.2-.5c-.1-.1-.3-.3-.3-.5l-.2-.7zm11-4.6v6.1h-3v-6.1l-5.7-9.8h2.6l.6.2.4.5 2.9 5.3a13.3 13.3 0 01.8 1.7 12 12 0 01.7-1.7l2.9-5.3c0-.2.2-.3.4-.5l.6-.2h2.6l-5.8 9.8zm4.7 4.6a1.7 1.7 0 01.5-1.1 1.7 1.7 0 011.2-.5 1.6 1.6 0 011.1.5 1.7 1.7 0 01.4 1.8c0 .2-.2.4-.4.5l-.5.4-.6.1a1.7 1.7 0 01-1.2-.5l-.4-.5-.1-.7z"/>
<path id="J" d="M128 335.7l.6 1.2a4.3 4.3 0 01-.4 3.4c-.4.6-.9 1.2-1.6 1.8l-.8-.5-.2-.2v-.2l.1-.3a6.5 6.5 0 00.6-.9 2.9 2.9 0 00.3-1.2l-.1-.8c0-.3-.2-.6-.4-.9l-.1-.3c0-.2.1-.4.4-.5l1.6-.6zm9.7 7.8l-.2.2h-.3a29 29 0 00-1.6-.6h-1a2 2 0 00-1.2.3 1 1 0 00-.5.9l.3.6c.1.2.3.3.6.4l.9.4a29 29 0 012 .7l.9.6c.3.2.5.5.6.8.2.3.3.7.3 1.2l-.3 1.5-.9 1.2a4 4 0 01-1.4.7 6.1 6.1 0 01-3 .2 6.7 6.7 0 01-2.1-.7l-.8-.6.7-1 .3-.3h.8a10.6 10.6 0 001.3.7l1 .1.8-.1.6-.3.3-.4.1-.5c0-.3 0-.5-.2-.7a2 2 0 00-.6-.4 29.4 29.4 0 01-3-1l-.9-.7-.6-.9c-.2-.3-.2-.8-.2-1.3a3.3 3.3 0 011-2.4 4 4 0 011.4-.7 5.6 5.6 0 014 .1c.6.2 1.1.6 1.5 1l-.6 1z"/>
<path id="K" d="M429.6 335.7l.5 1.2a4.3 4.3 0 01-.3 3.4l-1.6 1.8-.9-.5-.1-.2v-.2-.3a7.8 7.8 0 00.7-.9l.2-.5a2.5 2.5 0 000-1.5l-.5-.9v-.3c0-.2 0-.4.3-.5l1.7-.6zm3.6 0l.6 1.2a4.3 4.3 0 01-.4 3.4c-.3.6-.9 1.2-1.6 1.8l-.8-.5-.2-.2v-.2l.1-.3a7.8 7.8 0 00.6-.9l.2-.5a2.5 2.5 0 000-1.5c0-.3-.2-.6-.4-.9l-.1-.3c0-.2.1-.4.4-.5l1.6-.6z"/>
<path id="L" d="M387.8 336.6v6.3a19.2 19.2 0 01-.3 4h-1.9a41.8 41.8 0 01-.3-4v-6.3h2.5zm-3 14.4a1.7 1.7 0 01.5-1.1 1.7 1.7 0 011.2-.5 1.6 1.6 0 011.2.5 1.7 1.7 0 01.3 1.8c0 .2-.2.4-.3.5l-.5.4-.7.1a1.7 1.7 0 01-1.2-.5l-.3-.5-.2-.7z"/>
<path id="M" d="M16.4 11.3V15H14V4h3.9c.7 0 1.4.2 2 .3l1.3.8c.4.3.6.7.8 1.1l.3 1.4c0 .6-.1 1-.3 1.5a3 3 0 01-.8 1.2c-.4.3-.8.6-1.4.8-.5.2-1.2.2-2 .2h-1.3zm0-1.9h1.4c.6 0 1-.1 1.4-.4.3-.4.4-.8.4-1.4l-.1-.6a1.4 1.4 0 00-1-1H16.5v3.4zM26 11v4h-2.5V4H27c.8 0 1.5.2 2 .3l1.4.7c.4.3.6.6.8 1l.2 1.3-.1 1a3 3 0 01-1.1 1.6l-1 .5.5.3.4.5 2.3 3.8h-2.3c-.4 0-.7-.2-.9-.5l-1.8-3.2-.3-.3a1 1 0 00-.5 0H26zm0-1.8h1l1-.1.5-.4c.2-.1.3-.3.3-.5l.1-.7c0-.5-.1-.9-.4-1.1-.3-.3-.8-.4-1.5-.4h-1v3.2zm14.5-5.1v2H36v2.5h3.4v1.8H36v2.7h4.5V15h-7V4h7zM49 4v2h-4.5v2.7h3.7v2h-3.7V15h-2.6V4h7zM53 15h-2.5V4H53v11zm4.8-5.6L54.4 4h2.9l.2.3L59.7 8v-.2l.2-.1 1.9-3.3c0-.2.3-.3.5-.3h2.4l-3.4 5.2 3.5 5.7h-2.6l-.4-.1a1 1 0 01-.2-.3l-2.2-3.8-.1.3-2 3.5-.3.3-.4.1h-2.4l3.6-5.6z"/>
<path id="N" d="M20 136.3l-.2.3h-.4-.3a67.9 67.9 0 00-1-.5l-.8-.2c-.5 0-.8.1-1 .4a1 1 0 00-.4.8c0 .2 0 .4.2.5.1.2.3.3.6.4l.7.3a19.6 19.6 0 011.8.7l.8.5a2.6 2.6 0 01.8 2c0 .5-.1 1-.3 1.4a3.3 3.3 0 01-2 2l-1.6.2a5.3 5.3 0 01-2.1-.4 6 6 0 01-1-.4 4 4 0 01-.7-.6l.8-1.2s0-.2.2-.2l.3-.1.5.1a32.8 32.8 0 001.1.7l1 .1c.4 0 .7 0 1-.3.3-.2.4-.5.4-1 0-.2 0-.4-.2-.6l-.6-.4a23.2 23.2 0 01-2.6-.9l-.7-.5-.6-1a3.5 3.5 0 010-2.4l.8-1c.3-.3.7-.6 1.2-.8.4-.2 1-.2 1.6-.2a6 6 0 011.9.3 5 5 0 011.4.8l-.6 1.2zm6.6 6.7c.3 0 .6 0 .9-.2.3 0 .5-.2.7-.5l.4-.7.1-1V134h2.6v6.4a5 5 0 01-.4 1.9 4.1 4.1 0 01-2.4 2.4c-.5.2-1.2.3-2 .3-.7 0-1.3 0-1.9-.3-.6-.2-1-.6-1.5-1-.4-.4-.7-.8-.9-1.4-.2-.6-.3-1.2-.3-1.9v-6.4h2.5v6.4c0 .4 0 .8.2 1 0 .4.2.6.4.8.2.3.4.4.7.5l.9.2zm13.4-9v2h-4.5v2.8h3.7v2h-3.7v4.2h-2.6v-11h7zm8.3 0v2h-4.5v2.8h3.8v2h-3.8v4.2h-2.5v-11h7zm4.1 11H50v-11h2.5v11zm4.7-5.6l-3.4-5.3h2.9l.2.3L59 138l.1-.2.1-.1 2-3.3c0-.2.2-.3.4-.3h2.5l-3.5 5.2 3.5 5.7h-2.5l-.4-.1a1 1 0 01-.2-.3l-2.2-3.8-.2.3-2 3.5-.3.3-.3.1h-2.4l3.5-5.6z"/>
<path id="O" d="M20 201.3l-.2.3h-.4-.3a67.9 67.9 0 00-1-.5l-.8-.2c-.5 0-.8.1-1 .4a1 1 0 00-.4.8c0 .2 0 .4.2.5.1.2.3.3.6.4l.7.3a19.6 19.6 0 011.8.7l.8.5a2.6 2.6 0 01.8 2c0 .5-.1 1-.3 1.4a3.3 3.3 0 01-2 2l-1.6.2a5.3 5.3 0 01-2.1-.4 6 6 0 01-1-.4 4 4 0 01-.7-.6l.8-1.2s0-.2.2-.2l.3-.1.5.1a32.8 32.8 0 001.1.7l1 .1c.4 0 .7 0 1-.3.3-.2.4-.5.4-1 0-.2 0-.4-.2-.6l-.6-.4a23.2 23.2 0 01-2.6-.9l-.7-.5-.6-1a3.5 3.5 0 010-2.4l.8-1c.3-.3.7-.6 1.2-.8.4-.2 1-.2 1.6-.2a6 6 0 011.9.3 5 5 0 011.4.8l-.6 1.2zm6.6 6.7c.3 0 .6 0 .9-.2.3 0 .5-.2.7-.5l.4-.7.1-1V199h2.6v6.4a5 5 0 01-.4 1.9 4.1 4.1 0 01-2.4 2.4c-.5.2-1.2.3-2 .3-.7 0-1.3 0-1.9-.3-.6-.2-1-.6-1.5-1-.4-.4-.7-.8-.9-1.4-.2-.6-.3-1.2-.3-1.9v-6.4h2.5v6.4c0 .4 0 .8.2 1 0 .4.2.6.4.8.2.3.4.4.7.5l.9.2zm13.4-9v2h-4.5v2.8h3.7v2h-3.7v4.2h-2.6v-11h7zm8.3 0v2h-4.5v2.8h3.8v2h-3.8v4.2h-2.5v-11h7zm4.1 11H50v-11h2.5v11zm4.7-5.6l-3.4-5.3h2.9l.2.3L59 203l.1-.2.1-.1 2-3.3c0-.2.2-.3.4-.3h2.5l-3.5 5.2 3.5 5.7h-2.5l-.4-.1a1 1 0 01-.2-.3l-2.2-3.8-.2.3-2 3.5-.3.3-.3.1h-2.4l3.5-5.6z"/>
<path id="P" d="M8 264v2H3.4v2.6h3.4v1.8H3.4v2.7H8v1.9H1v-11h7zm4 5.4L8.8 264h2.9l.2.3L14 268v-.2l.2-.1 1.9-3.3c0-.2.3-.3.5-.3H19l-3.4 5.2L19 275h-2.6l-.4-.1a1 1 0 01-.2-.3l-2.2-3.8-.1.3-2 3.5-.3.3-.4.1H8.6l3.5-5.6zm15.2 2.8h.2l.1.1 1 1c-.4.7-1 1-1.6 1.4-.7.3-1.5.4-2.4.4-.8 0-1.5-.1-2.2-.4a4.8 4.8 0 01-2.7-3 6.5 6.5 0 010-4.4 5.2 5.2 0 013-3 6.3 6.3 0 014.5 0 4.8 4.8 0 011.6 1.1l-.9 1.2-.2.1-.3.1H27l-.2-.2a45 45 0 00-1.2-.5 3.5 3.5 0 00-2 .2l-1 .7-.6 1-.2 1.5c0 .6 0 1 .2 1.5l.6 1.1a2.6 2.6 0 002 1l.7-.1a2.6 2.6 0 001.5-.7h.2l.2-.1zm9.5-8.1v2h-4.5v2.5h3.5v1.8h-3.5v2.7h4.5v1.9h-7v-11h7zm4 7.2v3.7h-2.5v-11H42c.8 0 1.4.2 2 .3.6.2 1 .5 1.4.8l.8 1.1.2 1.4c0 .6 0 1-.3 1.5a3 3 0 01-.8 1.2l-1.3.8c-.6.2-1.2.2-2 .2h-1.3zm0-1.9H42c.7 0 1.1-.1 1.4-.4.3-.4.5-.8.5-1.4l-.1-.6a1.4 1.4 0 00-1-1h-2.1v3.4zm15-5.3v2h-3.1v8.9H50v-9H47v-2h8.7zm3.8 10.9h-2.6v-11h2.6v11zm12.8-5.5c0 .8-.1 1.6-.4 2.2a5.3 5.3 0 01-5.3 3.4c-.8 0-1.6-.1-2.3-.4a5.3 5.3 0 01-3.4-5.2c0-.8.2-1.5.5-2.2a5.2 5.2 0 013-3c.6-.2 1.4-.3 2.2-.3a6 6 0 012.4.4 5.4 5.4 0 013.3 5.1zm-2.6 0c0-.5 0-1-.2-1.4a3 3 0 00-.6-1.1c-.3-.3-.6-.6-1-.7a3.4 3.4 0 00-2.6 0c-.4.1-.7.4-1 .7a3 3 0 00-.5 1l-.3 1.5c0 .6.1 1 .3 1.5 0 .4.3.8.6 1.1.2.3.5.5 1 .7l1.2.2c.5 0 1 0 1.3-.2l1-.7c.3-.3.5-.7.6-1.1l.2-1.5zm5.1-5.4h.5l.2.2.2.2 5.2 6.5a13.8 13.8 0 010-1.1V264H83V275h-1.9a1 1 0 01-.4-.4l-5.1-6.5a23.3 23.3 0 010 1v5.9h-2.2v-11h1.3z"/>
<path id="Q" d="M31.8 334.5c0 .8-.1 1.6-.4 2.2a5.1 5.1 0 01-3 2.9c-.6.3-1.4.4-2.3.4H22v-11h4.2c.9 0 1.7.2 2.4.5s1.3.6 1.8 1.1c.5.5.8 1 1.1 1.8.3.6.4 1.3.4 2.1zm-2.6 0c0-.5 0-1-.2-1.4-.1-.5-.3-.8-.6-1.1-.3-.3-.6-.6-1-.7-.3-.2-.8-.3-1.3-.3h-1.7v7h1.7c.5 0 1 0 1.3-.2l1-.7c.3-.3.5-.7.6-1.1.2-.4.2-1 .2-1.5zm14.6 0c0 .8-.1 1.6-.4 2.2a5.3 5.3 0 01-5.3 3.4c-.8 0-1.6-.1-2.3-.4a5.3 5.3 0 01-3.3-5.2c0-.8.1-1.5.4-2.2a5.2 5.2 0 013-3c.6-.2 1.4-.3 2.2-.3a6 6 0 012.4.4 5.4 5.4 0 013.3 5.1zm-2.6 0c0-.5 0-1-.2-1.4a3 3 0 00-.6-1.1c-.3-.3-.6-.6-1-.7a3.4 3.4 0 00-2.6 0l-1 .7a3 3 0 00-.5 1c-.2.5-.2 1-.2 1.5 0 .6 0 1 .2 1.5.1.4.3.8.6 1.1.2.3.6.5 1 .7l1.2.2c.5 0 1 0 1.3-.2l1-.7c.3-.3.5-.7.6-1.1.2-.4.2-1 .2-1.5zm5.2-5.4h.4l.2.2.2.2 5.2 6.5a13.8 13.8 0 010-1.1V329h2.2V340H52.8a1 1 0 01-.4-.4l-5.2-6.5a23.3 23.3 0 010 1v5.9H45v-11h1.4zm17 0v2H59v2.5h3.5v1.8h-3.5v2.7h4.5v1.9h-7v-11h7z"/>
<path id="R" d="M8 69v2H3.4v2.6h3.4v1.8H3.4V78H8v2H1V69h7zm4 5.4L8.8 69h2.9l.2.3L14 73v-.2l.2-.1 1.9-3.3c0-.2.3-.3.5-.3H19l-3.4 5.2L19 80h-2.6l-.4-.1a1 1 0 01-.2-.3l-2.2-3.8-.1.3-2 3.5-.3.3-.4.1H8.6l3.5-5.6zm15.2 2.8h.2l.1.1 1 1c-.4.7-1 1-1.6 1.4-.7.3-1.5.4-2.4.4-.8 0-1.5-.1-2.2-.4a4.8 4.8 0 01-2.7-3 6.5 6.5 0 010-4.4 5.2 5.2 0 013-3 6.3 6.3 0 014.5 0 4.8 4.8 0 011.6 1.1l-.9 1.2-.2.1-.3.1H27l-.2-.2a45 45 0 00-1.2-.5 3.5 3.5 0 00-2 .2l-1 .7-.6 1-.2 1.5c0 .6 0 1 .2 1.5l.6 1.1a2.6 2.6 0 002 1l.7-.1a2.6 2.6 0 001.5-.7h.2l.2-.1zm9.5-8.1v2h-4.5v2.5h3.5v1.8h-3.5V78h4.5v2h-7V69h7zm4 7.2V80h-2.5V69H42c.8 0 1.4.2 2 .3.6.2 1 .5 1.4.8l.8 1.1.2 1.4c0 .6 0 1-.3 1.5a3 3 0 01-.8 1.2l-1.3.8c-.6.2-1.2.2-2 .2h-1.3zm0-1.9H42c.7 0 1.1-.1 1.4-.4.3-.4.5-.8.5-1.4l-.1-.6a1.4 1.4 0 00-1-1h-2.1v3.4zm15-5.3v2h-3.1V80H50v-9H47v-2h8.7zM59.5 80h-2.6V69h2.6v11zm12.8-5.5c0 .8-.1 1.6-.4 2.2a5.3 5.3 0 01-5.3 3.4c-.8 0-1.6-.1-2.3-.4a5.3 5.3 0 01-3.4-5.2c0-.8.2-1.5.5-2.2a5.2 5.2 0 013-3c.6-.2 1.4-.3 2.2-.3a6 6 0 012.4.4 5.4 5.4 0 013.3 5.1zm-2.6 0c0-.5 0-1-.2-1.4a3 3 0 00-.6-1.1c-.3-.3-.6-.6-1-.7a3.4 3.4 0 00-2.6 0c-.4.1-.7.4-1 .7a3 3 0 00-.5 1l-.3 1.5c0 .6.1 1 .3 1.5 0 .4.3.8.6 1.1.2.3.5.5 1 .7l1.2.2c.5 0 1 0 1.3-.2l1-.7c.3-.3.5-.7.6-1.1l.2-1.5zm5.1-5.4h.5l.2.2.2.2 5.2 6.5a13.8 13.8 0 010-1.1V69H83V80h-1.9a1 1 0 01-.4-.4L75.8 73a23.3 23.3 0 010 1V80h-2.2V69h1.3z"/>
</defs>
<g fill="none" fill-rule="evenodd">
<g stroke-linejoin="round" stroke-width="3.8">
<path stroke="#3AC" d="M82.4 46.5v13h-60v12m60-25v13h21.8v12"/>
<path fill="#C3E7F1" stroke="#3AC" d="M6 5h152.7v41.7H6z"/>
<path fill="#F5F5F5" stroke="#B7B7B7" d="M195.8 46.5v25"/>
<path fill="#F5F5F5" stroke="#B7B7B7" d="M168.5 5h54.6v41.7h-54.6z"/>
<path fill="#F5F5F5" stroke="#B7B7B7" d="M261.3 46.5v25"/>
<path fill="#F5F5F5" stroke="#B7B7B7" d="M234 5h54.5v41.7H234z"/>
<path fill="#F5F5F5" stroke="#B7B7B7" d="M377 46.5v25"/>
<path fill="#F5F5F5" stroke="#B7B7B7" d="M299.5 5h153.8v41.7H299.5z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M22.4 113v21.8"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M6 71.5h32.7v41.7H6z"/>
<path stroke="#3AC" d="M104.2 113v12H76.9v9.8m27.3-21.8v12h31.6v9.8"/>
<path fill="#C3E7F1" stroke="#3AC" d="M49.6 71.5h109.1v41.7H49.6z"/>
<path fill="#F5F5F5" stroke="#B7B7B7" d="M195.8 113v21.8"/>
<path fill="#F5F5F5" stroke="#B7B7B7" d="M168.5 71.5h54.6v41.7h-54.6z"/>
<path fill="#F5F5F5" stroke="#B7B7B7" d="M261.3 113v21.8"/>
<path fill="#F5F5F5" stroke="#B7B7B7" d="M234 71.5h54.5v41.7H234z"/>
<path fill="#F5F5F5" stroke="#B7B7B7" d="M377 113v21.8"/>
<path fill="#F5F5F5" stroke="#B7B7B7" d="M299.5 71.5h153.8v41.7H299.5z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M6 134.8h32.7v41.5H6z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M77 176.3v26.2"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M49.6 134.8h54.6v41.5H49.6z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M195.8 176.3v26.2"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M168.5 134.8h54.6v41.5h-54.6z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M261.3 176.3v26.2"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M234 134.8h54.5v41.5H234z"/>
<path stroke="#3AC" d="M377 176.3v14.2h-22v12m22-26.2v14.2h60v12"/>
<path fill="#C3E7F1" stroke="#3AC" d="M299.5 134.8h153.8v41.5H299.5z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M135.8 176.3v26.2"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M114 134.8h43.6v41.5H114z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M22.4 244v25"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M6 202.2h32.7v41.7H6z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M77 244v25"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M49.6 202.2h54.6v41.7H49.6z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M195.8 244v25"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M168.5 202.2h54.6v41.7h-54.6z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M261.3 244v25"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M234 202.2h54.5v41.7H234z"/>
<path stroke="#3AC" d="M355 244v12h-21.7v13m21.8-25v12h37v13"/>
<path fill="#C3E7F1" stroke="#3AC" d="M299.5 202.2h110.1v41.7H299.5z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M135.8 244v25"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M114 202.2h43.6v41.7H114z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M437 244v25"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M420.5 202.2h32.8v41.7h-32.8z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M22.4 310.5v25"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M6 268.7h32.7v41.8H6z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M77 310.5v25"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M49.6 268.7h54.6v41.8H49.6z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M195.8 310.5v21.8-18.6 21.8"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M168.5 268.7h54.6v41.8h-54.6z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M234 268.7h54.5v41.8H234z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M333.3 310.5v25"/>
<path fill="#C3E7F1" stroke="#3AC" d="M299.5 268.7H366v41.8h-66.5z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M135.8 310.5v25"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M114 268.7h43.6v41.8H114z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M437 310.5v25"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M420.5 268.7h32.8v41.8h-32.8z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M392.2 310.5v25"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M375.8 268.7h32.7v41.8h-32.7z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M8 335.5h28.7a2 2 0 012 2V375a2 2 0 01-2 2H8a2 2 0 01-2-2v-37.5c0-1 .9-2 2-2z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M49.6 335.5h54.6V377H49.6z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M168.5 335.5h54.6V377h-54.6z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M234 335.5h54.5V377H234z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M299.5 335.5H366V377h-66.5z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M114 335.5h43.6V377H114z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M420.5 335.5h32.8V377h-32.8z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M375.8 335.5h32.7V377h-32.7z"/>
<path fill="#B5F3D4" stroke="#3AD787" d="M261.3 310.5v25"/>
</g>
<g fill-rule="nonzero">
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#a"/>
<use fill="#1A1E23" xlink:href="#a"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#b"/>
<use fill="#1A1E23" xlink:href="#b"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#c"/>
<use fill="#1A1E23" xlink:href="#c"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#d"/>
<use fill="#1A1E23" xlink:href="#d"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#e"/>
<use fill="#1A1E23" xlink:href="#e"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#f"/>
<use fill="#1A1E23" xlink:href="#f"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#g"/>
<use fill="#1A1E23" xlink:href="#g"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#h"/>
<use fill="#1A1E23" xlink:href="#h"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#i"/>
<use fill="#1A1E23" xlink:href="#i"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#j"/>
<use fill="#1A1E23" xlink:href="#j"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#k"/>
<use fill="#1A1E23" xlink:href="#k"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#l"/>
<use fill="#1A1E23" xlink:href="#l"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#m"/>
<use fill="#1A1E23" xlink:href="#m"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#n"/>
<use fill="#1A1E23" xlink:href="#n"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#o"/>
<use fill="#1A1E23" xlink:href="#o"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#p"/>
<use fill="#1A1E23" xlink:href="#p"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#q"/>
<use fill="#1A1E23" xlink:href="#q"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#r"/>
<use fill="#1A1E23" xlink:href="#r"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#s"/>
<use fill="#1A1E23" xlink:href="#s"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#t"/>
<use fill="#1A1E23" xlink:href="#t"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#u"/>
<use fill="#1A1E23" xlink:href="#u"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#v"/>
<use fill="#1A1E23" xlink:href="#v"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#w"/>
<use fill="#1A1E23" xlink:href="#w"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#x"/>
<use fill="#1A1E23" xlink:href="#x"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#y"/>
<use fill="#1A1E23" xlink:href="#y"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#z"/>
<use fill="#1A1E23" xlink:href="#z"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#A"/>
<use fill="#1A1E23" xlink:href="#A"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#B"/>
<use fill="#1A1E23" xlink:href="#B"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#C"/>
<use fill="#1A1E23" xlink:href="#C"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#D"/>
<use fill="#1A1E23" xlink:href="#D"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#E"/>
<use fill="#1A1E23" xlink:href="#E"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#F"/>
<use fill="#1A1E23" xlink:href="#F"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#G"/>
<use fill="#1A1E23" xlink:href="#G"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#H"/>
<use fill="#1A1E23" xlink:href="#H"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#I"/>
<use fill="#1A1E23" xlink:href="#I"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#J"/>
<use fill="#1A1E23" xlink:href="#J"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#K"/>
<use fill="#1A1E23" xlink:href="#K"/>
</g>
<g transform="translate(6 11)">
<use fill="#3D4251" xlink:href="#L"/>
<use fill="#1A1E23" xlink:href="#L"/>
</g>
</g>
<rect width="101" height="20" x="483" y="16" fill="#3AC" fill-rule="nonzero" stroke="#3AC" stroke-width="2.2" rx="10"/>
<rect width="101" height="20" x="483" y="146" fill="#3AC" fill-rule="nonzero" stroke="#3AC" stroke-width="2.2" rx="10"/>
<rect width="101" height="20" x="483" y="211" fill="#3AC" fill-rule="nonzero" stroke="#3AC" stroke-width="2.2" rx="10"/>
<rect width="101" height="20" x="483" y="276" fill="#3AC" fill-rule="nonzero" stroke="#3AC" stroke-width="2.2" rx="10"/>
<rect width="101" height="20" x="483" y="341" fill="#3AD787" fill-rule="nonzero" stroke="#3AD787" stroke-width="2.2" rx="10"/>
<rect width="101" height="20" x="483" y="81" fill="#3AC" fill-rule="nonzero" stroke="#3AC" stroke-width="2.2" rx="10"/>
<g fill-rule="nonzero">
<g transform="translate(493 16)">
<use fill="#000" xlink:href="#M"/>
<use fill="#FFF" xlink:href="#M"/>
</g>
<g transform="translate(493 16)">
<use fill="#000" xlink:href="#N"/>
<use fill="#FFF" xlink:href="#N"/>
</g>
<g transform="translate(493 16)">
<use fill="#000" xlink:href="#O"/>
<use fill="#FFF" xlink:href="#O"/>
</g>
<g transform="translate(493 16)">
<use fill="#000" xlink:href="#P"/>
<use fill="#FFF" xlink:href="#P"/>
</g>
<g transform="translate(493 16)">
<use fill="#000" xlink:href="#Q"/>
<use fill="#FFF" xlink:href="#Q"/>
</g>
<g transform="translate(493 16)">
<use fill="#000" xlink:href="#R"/>
<use fill="#FFF" xlink:href="#R"/>
</g>
</g>
</g>
</svg>

Before

Width:  |  Height:  |  Size: 12 KiB

After

Width:  |  Height:  |  Size: 45 KiB

Some files were not shown because too many files have changed in this diff Show More