Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-23 23:20:52 +03:00)

Commit bbc3e96690: Merge branch 'develop' into nightly.spacy.io

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a22,<8.0.0a30",
+    "thinc>=8.0.0a23,<8.0.0a30",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "smart_open>=2.0.0,<3.0.0"

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a22,<8.0.0a30
+thinc>=8.0.0a23,<8.0.0a30
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0

@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a22,<8.0.0a30
+    thinc>=8.0.0a23,<8.0.0a30
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a22,<8.0.0a30
+    thinc>=8.0.0a23,<8.0.0a30
     blis>=0.4.0,<0.5.0
     wasabi>=0.7.1,<1.1.0
     srsly>=2.1.0,<3.0.0

@@ -17,23 +17,28 @@ from .. import displacy
 def evaluate_cli(
     # fmt: off
     model: str = Arg(..., help="Model name or path"),
-    data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True),
+    data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
     output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
-    gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
     displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
     # fmt: on
 ):
     """
-    Evaluate a model. To render a sample of parses in a HTML file, set an
-    output directory as the displacy_path argument.
+    Evaluate a model. Expects a loadable spaCy model and evaluation data in the
+    binary .spacy format. The --gold-preproc option sets up the evaluation
+    examples with gold-standard sentences and tokens for the predictions. Gold
+    preprocessing helps the annotations align to the tokenization, and may
+    result in sequences of more consistent length. However, it may reduce
+    runtime accuracy due to train/test skew. To render a sample of dependency
+    parses in a HTML file, set as output directory as the displacy_path argument.
     """
     evaluate(
         model,
         data_path,
         output=output,
-        gpu_id=gpu_id,
+        use_gpu=use_gpu,
         gold_preproc=gold_preproc,
         displacy_path=displacy_path,
         displacy_limit=displacy_limit,

@@ -45,7 +50,7 @@ def evaluate(
     model: str,
     data_path: Path,
     output: Optional[Path] = None,
-    gpu_id: int = -1,
+    use_gpu: int = -1,
     gold_preproc: bool = False,
     displacy_path: Optional[Path] = None,
     displacy_limit: int = 25,

@@ -53,8 +58,8 @@ def evaluate(
 ) -> Scorer:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
-    if gpu_id >= 0:
-        require_gpu(gpu_id)
+    if use_gpu >= 0:
+        require_gpu(use_gpu)
     util.set_env_log(False)
     data_path = util.ensure_path(data_path)
     output_path = util.ensure_path(output)

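The change above only renames the Python-level parameter: the CLI flag stays --gpu-id, but the keyword becomes use_gpu and the expected input becomes a binary .spacy file instead of JSON. A minimal sketch of calling the refactored function directly (the model name and data path below are hypothetical):

from pathlib import Path
from spacy.cli.evaluate import evaluate

scores = evaluate(
    "en_core_web_sm",            # loadable pipeline name or path
    Path("corpus/dev.spacy"),    # DocBin data serialized as .spacy (was JSON-formatted before)
    use_gpu=-1,                  # renamed from gpu_id; -1 keeps evaluation on the CPU
    gold_preproc=False,
)
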
@@ -19,9 +19,6 @@ after_pipeline_creation = null
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
 
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
 [components]
 
 # Training hyper-parameters and additional features.

@@ -169,9 +169,9 @@ class Errors:
             "training a named entity recognizer, also make sure that none of "
             "your annotated entity spans have leading or trailing whitespace "
             "or punctuation. "
-            "You can also use the experimental `debug-data` command to "
+            "You can also use the experimental `debug data` command to "
             "validate your JSON-formatted training data. For details, run:\n"
-            "python -m spacy debug-data --help")
+            "python -m spacy debug data --help")
     E025 = ("String is too long: {length} characters. Max is 2**30.")
     E026 = ("Error accessing token at position {i}: out of bounds in Doc of "
             "length {length}.")

@@ -510,7 +510,7 @@ class Errors:
     E952 = ("The section '{name}' is not a valid section in the provided config.")
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive a valid input.")
-    E955 = ("Can't find table '{table}' for language '{lang}' in spacy-lookups-data.")
+    E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "

@@ -633,6 +633,11 @@ class Errors:
     E1001 = ("Target token outside of matched span for match with tokens "
              "'{span}' and offset '{index}' matched by patterns '{patterns}'.")
     E1002 = ("Span index out of range.")
+    E1003 = ("Unsupported lemmatizer mode '{mode}'.")
+    E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
+             "Required tables '{tables}', found '{found}'. If you are not "
+             "providing custom lookups, make sure you have the package "
+             "spacy-lookups-data installed.")
 
 
 @add_codes

@@ -20,7 +20,7 @@ def create_docbin_reader(
 
 class Corpus:
     """Iterate Example objects from a file or directory of DocBin (.spacy)
-    formated data files.
+    formatted data files.
 
     path (Path): The directory or filename to read from.
     gold_preproc (bool): Whether to set up the Example object with gold-standard

@@ -39,7 +39,7 @@ class Corpus:
 
     def __init__(
         self,
-        path,
+        path: Union[str, Path],
         *,
         limit: int = 0,
         gold_preproc: bool = False,

@@ -136,8 +136,7 @@ class Corpus:
         for loc in locs:
             loc = util.ensure_path(loc)
             if loc.parts[-1].endswith(".spacy"):
-                with loc.open("rb") as file_:
-                    doc_bin = DocBin().from_bytes(file_.read())
+                doc_bin = DocBin().from_disk(loc)
                 docs = doc_bin.get_docs(vocab)
                 for doc in docs:
                     if len(doc):

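For context, the reader change swaps a manual open()/from_bytes() round trip for DocBin.from_disk. A minimal sketch, assuming a .spacy file at a hypothetical path:

from spacy.tokens import DocBin
from spacy.vocab import Vocab

doc_bin = DocBin().from_disk("corpus/train.spacy")  # replaces DocBin().from_bytes(file_.read())
docs = list(doc_bin.get_docs(Vocab()))              # Example creation happens downstream in Corpus
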
@@ -1,38 +1,17 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .lemmatizer import GreekLemmatizer
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from ...lookups import load_lookups
+from .lemmatizer import GreekLemmatizer
+from ...lookups import Lookups
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.el.GreekLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.el.GreekLemmatizer")
-def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]:
-    tables = ["lemma_index", "lemma_exc", "lemma_rules"]
-
-    def lemmatizer_factory(nlp: Language) -> GreekLemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return GreekLemmatizer(lookups=lookups)
-
-    return lemmatizer_factory
 
 
 class GreekDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES

@@ -47,4 +26,22 @@ class Greek(Language):
     Defaults = GreekDefaults
 
 
+@Greek.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Greek"]

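With the @Greek.factory("lemmatizer", ...) registration above, the lemmatizer becomes an ordinary pipeline component instead of being wired into the vocab through an [nlp.lemmatizer] config block. A hedged usage sketch (rule mode relies on lookup tables from the spacy-lookups-data package, so the call may fail if they are missing):

import spacy

nlp = spacy.blank("el")
nlp.add_pipe("lemmatizer", config={"mode": "rule"})  # uses the make_lemmatizer factory above
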
@@ -1,6 +1,7 @@
-from typing import Dict, List
+from typing import List
 
-from ...lemmatizer import Lemmatizer
+from ...pipeline import Lemmatizer
+from ...tokens import Token
 
 
 class GreekLemmatizer(Lemmatizer):

@@ -14,13 +15,27 @@ class GreekLemmatizer(Lemmatizer):
     not applicable for Greek language.
     """
 
-    def lemmatize(
-        self,
-        string: str,
-        index: Dict[str, List[str]],
-        exceptions: Dict[str, Dict[str, List[str]]],
-        rules: Dict[str, List[List[str]]],
-    ) -> List[str]:
+    def rule_lemmatize(self, token: Token) -> List[str]:
+        """Lemmatize using a rule-based approach.
+
+        token (Token): The token to lemmatize.
+        RETURNS (list): The available lemmas for the string.
+        """
+        cache_key = (token.lower, token.pos)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        univ_pos = token.pos_.lower()
+        if univ_pos in ("", "eol", "space"):
+            return [string.lower()]
+
+        index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        index = index_table.get(univ_pos, {})
+        exceptions = exc_table.get(univ_pos, {})
+        rules = rules_table.get(univ_pos, {})
+
         string = string.lower()
         forms = []
         if string in index:

@@ -42,4 +57,6 @@ class GreekLemmatizer(Lemmatizer):
             forms.extend(oov_forms)
         if not forms:
             forms.append(string)
-        return list(set(forms))
+        forms = list(set(forms))
+        self.cache[cache_key] = forms
+        return forms

@@ -1,39 +1,18 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+
+from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .lemmatizer import is_base_form
 from .punctuation import TOKENIZER_INFIXES
+from .lemmatizer import EnglishLemmatizer
 from ...language import Language
-from ...lemmatizer import Lemmatizer
-from ...lookups import load_lookups
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.en.EnglishLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.en.EnglishLemmatizer")
-def create_lemmatizer() -> Callable[[Language], Lemmatizer]:
-    tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
-
-    def lemmatizer_factory(nlp: Language) -> Lemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return Lemmatizer(lookups=lookups, is_base_form=is_base_form)
-
-    return lemmatizer_factory
+from ...lookups import Lookups
 
 
 class EnglishDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS

@@ -46,4 +25,22 @@ class English(Language):
     Defaults = EnglishDefaults
 
 
+@English.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["English"]

@@ -1,36 +1,43 @@
 from typing import Optional
 
+from ...pipeline import Lemmatizer
+from ...tokens import Token
 
-def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
-    """
-    Check whether we're dealing with an uninflected paradigm, so we can
-    avoid lemmatization entirely.
 
-    univ_pos (unicode / int): The token's universal part-of-speech tag.
-    morphology (dict): The token's morphological features following the
-        Universal Dependencies scheme.
+class EnglishLemmatizer(Lemmatizer):
+    """English lemmatizer. Only overrides is_base_form.
     """
-    if morphology is None:
-        morphology = {}
-    if univ_pos == "noun" and morphology.get("Number") == "sing":
-        return True
-    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-        return True
-    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-    # morphology
-    elif univ_pos == "verb" and (
-        morphology.get("VerbForm") == "fin"
-        and morphology.get("Tense") == "pres"
-        and morphology.get("Number") is None
-    ):
-        return True
-    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-        return True
-    elif morphology.get("VerbForm") == "inf":
-        return True
-    elif morphology.get("VerbForm") == "none":
-        return True
-    elif morphology.get("Degree") == "pos":
-        return True
-    else:
-        return False
+
+    def is_base_form(self, token: Token) -> bool:
+        """
+        Check whether we're dealing with an uninflected paradigm, so we can
+        avoid lemmatization entirely.
+
+        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        morphology (dict): The token's morphological features following the
+            Universal Dependencies scheme.
+        """
+        univ_pos = token.pos_.lower()
+        morphology = token.morph.to_dict()
+        if univ_pos == "noun" and morphology.get("Number") == "Sing":
+            return True
+        elif univ_pos == "verb" and morphology.get("VerbForm") == "Inf":
+            return True
+        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
+        # morphology
+        elif univ_pos == "verb" and (
+            morphology.get("VerbForm") == "Fin"
+            and morphology.get("Tense") == "Pres"
+            and morphology.get("Number") is None
+        ):
+            return True
+        elif univ_pos == "adj" and morphology.get("Degree") == "Pos":
+            return True
+        elif morphology.get("VerbForm") == "Inf":
+            return True
+        elif morphology.get("VerbForm") == "None":
+            return True
+        elif morphology.get("Degree") == "Pos":
+            return True
+        else:
+            return False

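The rewritten is_base_form works on a Token and reads morphology from Token.morph.to_dict(), so feature values follow Universal Dependencies capitalization ("Sing", "Inf", "Pos") rather than the old lowercase strings. A small self-contained illustration of that value change (helper name is hypothetical, not spaCy API):

def looks_like_base_form(univ_pos: str, morphology: dict) -> bool:
    # mirrors the noun/verb branches of EnglishLemmatizer.is_base_form above
    if univ_pos == "noun" and morphology.get("Number") == "Sing":
        return True
    if univ_pos == "verb" and morphology.get("VerbForm") == "Inf":
        return True
    return False

print(looks_like_base_form("noun", {"Number": "Sing"}))  # True
print(looks_like_base_form("noun", {"Number": "sing"}))  # False: old lowercase values no longer match
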
@@ -1,5 +1,6 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+
+from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES

@@ -7,33 +8,12 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .lemmatizer import FrenchLemmatizer, is_base_form
-from ...lookups import load_lookups
+from .lemmatizer import FrenchLemmatizer
+from ...lookups import Lookups
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.fr.FrenchLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.fr.FrenchLemmatizer")
-def create_lemmatizer() -> Callable[[Language], FrenchLemmatizer]:
-    tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
-
-    def lemmatizer_factory(nlp: Language) -> FrenchLemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return FrenchLemmatizer(lookups=lookups, is_base_form=is_base_form)
-
-    return lemmatizer_factory
 
 
 class FrenchDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES

@@ -49,4 +29,22 @@ class French(Language):
     Defaults = FrenchDefaults
 
 
+@French.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["French"]

@@ -1,8 +1,7 @@
-from typing import Optional, List, Dict
+from typing import List, Dict
 
-from ...lemmatizer import Lemmatizer
-from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
-from ...symbols import SCONJ, CCONJ
+from ...pipeline import Lemmatizer
+from ...tokens import Token
 
 
 class FrenchLemmatizer(Lemmatizer):

@@ -15,65 +14,55 @@ class FrenchLemmatizer(Lemmatizer):
     the lookup table.
     """
 
-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        if "lemma_rules" not in self.lookups:
-            return [lookup_table.get(string, string)]
-        if univ_pos in (NOUN, "NOUN", "noun"):
-            univ_pos = "noun"
-        elif univ_pos in (VERB, "VERB", "verb"):
-            univ_pos = "verb"
-        elif univ_pos in (ADJ, "ADJ", "adj"):
-            univ_pos = "adj"
-        elif univ_pos in (ADP, "ADP", "adp"):
-            univ_pos = "adp"
-        elif univ_pos in (ADV, "ADV", "adv"):
-            univ_pos = "adv"
-        elif univ_pos in (AUX, "AUX", "aux"):
-            univ_pos = "aux"
-        elif univ_pos in (CCONJ, "CCONJ", "cconj"):
-            univ_pos = "cconj"
-        elif univ_pos in (DET, "DET", "det"):
-            univ_pos = "det"
-        elif univ_pos in (PRON, "PRON", "pron"):
-            univ_pos = "pron"
-        elif univ_pos in (PUNCT, "PUNCT", "punct"):
-            univ_pos = "punct"
-        elif univ_pos in (SCONJ, "SCONJ", "sconj"):
-            univ_pos = "sconj"
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Dict:
+        if mode == "rule":
+            return {
+                "required_tables": [
+                    "lemma_lookup",
+                    "lemma_rules",
+                    "lemma_exc",
+                    "lemma_index",
+                ],
+                "optional_tables": [],
+            }
         else:
-            return [self.lookup(string)]
+            return super().get_lookups_config(mode)
 
+    def rule_lemmatize(self, token: Token) -> List[str]:
+        cache_key = (token.orth, token.pos)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        univ_pos = token.pos_.lower()
+        if univ_pos in ("", "eol", "space"):
+            return [string.lower()]
+        elif "lemma_rules" not in self.lookups or univ_pos not in (
+            "noun",
+            "verb",
+            "adj",
+            "adp",
+            "adv",
+            "aux",
+            "cconj",
+            "det",
+            "pron",
+            "punct",
+            "sconj",
+        ):
+            return self.lookup_lemmatize(token)
         index_table = self.lookups.get_table("lemma_index", {})
         exc_table = self.lookups.get_table("lemma_exc", {})
         rules_table = self.lookups.get_table("lemma_rules", {})
-        lemmas = self.lemmatize(
-            string,
-            index_table.get(univ_pos, {}),
-            exc_table.get(univ_pos, {}),
-            rules_table.get(univ_pos, []),
-        )
-        return lemmas
-
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        if orth is not None and orth in lookup_table:
-            return lookup_table[orth][0]
-        return string
-
-    def lemmatize(
-        self,
-        string: str,
-        index: Dict[str, List[str]],
-        exceptions: Dict[str, Dict[str, List[str]]],
-        rules: Dict[str, List[List[str]]],
-    ) -> List[str]:
         lookup_table = self.lookups.get_table("lemma_lookup", {})
+        index = index_table.get(univ_pos, {})
+        exceptions = exc_table.get(univ_pos, {})
+        rules = rules_table.get(univ_pos, [])
         string = string.lower()
         forms = []
         if string in index:
             forms.append(string)
+            self.cache[cache_key] = forms
+            return forms
         forms.extend(exceptions.get(string, []))
         oov_forms = []

@@ -90,45 +79,9 @@ class FrenchLemmatizer(Lemmatizer):
         if not forms:
             forms.extend(oov_forms)
         if not forms and string in lookup_table.keys():
-            forms.append(lookup_table[string][0])
+            forms.append(self.lookup_lemmatize(token)[0])
         if not forms:
             forms.append(string)
-        return list(set(forms))
-
-
-def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
-    """
-    Check whether we're dealing with an uninflected paradigm, so we can
-    avoid lemmatization entirely.
-    """
-    morphology = {} if morphology is None else morphology
-    others = [
-        key
-        for key in morphology
-        if key not in (POS, "Number", "POS", "VerbForm", "Tense")
-    ]
-    if univ_pos == "noun" and morphology.get("Number") == "sing":
-        return True
-    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-        return True
-    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-    # morphology
-    elif univ_pos == "verb" and (
-        morphology.get("VerbForm") == "fin"
-        and morphology.get("Tense") == "pres"
-        and morphology.get("Number") is None
-        and not others
-    ):
-        return True
-    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-        return True
-    elif "VerbForm=inf" in morphology:
-        return True
-    elif "VerbForm=none" in morphology:
-        return True
-    elif "Number=sing" in morphology:
-        return True
-    elif "Degree=pos" in morphology:
-        return True
-    else:
-        return False
+        forms = list(set(forms))
+        self.cache[cache_key] = forms
+        return forms

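Each language lemmatizer now declares which lookup tables a given mode needs via get_lookups_config, and the shared factory loads them through Lemmatizer.load_lookups. A hedged sketch based on the French hunk above (requires spaCy from this branch to be importable):

from spacy.lang.fr.lemmatizer import FrenchLemmatizer

print(FrenchLemmatizer.get_lookups_config("rule"))
# expected per the diff: {"required_tables": ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"],
#                         "optional_tables": []}
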
@@ -38,8 +38,6 @@ def create_tokenizer(split_mode: Optional[str] = None):
 class JapaneseTokenizer(DummyTokenizer):
     def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
         self.vocab = nlp.vocab
-        # TODO: is this the right way to do it?
-        self.vocab.morphology.load_tag_map(TAG_MAP)
         self.split_mode = split_mode
         self.tokenizer = try_sudachi_import(self.split_mode)
 

@@ -7,6 +7,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...tokens import Doc
 from ...compat import copy_reg
 from ...symbols import POS
 from ...util import DummyTokenizer, registry
 
 

@@ -29,8 +30,6 @@ def create_tokenizer():
 class KoreanTokenizer(DummyTokenizer):
     def __init__(self, nlp: Optional[Language] = None):
         self.vocab = nlp.vocab
-        # TODO: is this the right way to do it?
-        self.vocab.morphology.load_tag_map(TAG_MAP)
         MeCab = try_mecab_import()
         self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
 

@@ -44,6 +43,7 @@ class KoreanTokenizer(DummyTokenizer):
         for token, dtoken in zip(doc, dtokens):
             first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
             token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
+            token.pos = TAG_MAP[token.tag_][POS]
             token.lemma_ = dtoken["lemma"]
         doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
         return doc

@@ -1,5 +1,6 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+
+from thinc.api import Model
 
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

@@ -7,32 +8,11 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
-from ...lookups import load_lookups
+from ...lookups import Lookups
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.nl.DutchLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.nl.DutchLemmatizer")
-def create_lemmatizer() -> Callable[[Language], DutchLemmatizer]:
-    tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
-
-    def lemmatizer_factory(nlp: Language) -> DutchLemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return DutchLemmatizer(lookups=lookups)
-
-    return lemmatizer_factory
 
 
 class DutchDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES

@@ -46,4 +26,22 @@ class Dutch(Language):
     Defaults = DutchDefaults
 
 
+@Dutch.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Dutch"]

@@ -1,44 +1,34 @@
-from typing import Optional, List, Dict, Tuple
+from typing import List, Dict
 
-from ...lemmatizer import Lemmatizer
-from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
+from ...pipeline import Lemmatizer
+from ...tokens import Token
 
 
 class DutchLemmatizer(Lemmatizer):
-    # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
-    univ_pos_name_variants = {
-        NOUN: "noun",
-        "NOUN": "noun",
-        "noun": "noun",
-        VERB: "verb",
-        "VERB": "verb",
-        "verb": "verb",
-        AUX: "verb",
-        "AUX": "verb",
-        "aux": "verb",
-        ADJ: "adj",
-        "ADJ": "adj",
-        "adj": "adj",
-        ADV: "adv",
-        "ADV": "adv",
-        "adv": "adv",
-        PRON: "pron",
-        "PRON": "pron",
-        "pron": "pron",
-        DET: "det",
-        "DET": "det",
-        "det": "det",
-        ADP: "adp",
-        "ADP": "adp",
-        "adp": "adp",
-        NUM: "num",
-        "NUM": "num",
-        "num": "num",
-    }
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Dict:
+        if mode == "rule":
+            return {
+                "required_tables": [
+                    "lemma_lookup",
+                    "lemma_rules",
+                    "lemma_exc",
+                    "lemma_index",
+                ],
+            }
+        else:
+            return super().get_lookups_config(mode)
 
-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
+    def lookup_lemmatize(self, token: Token) -> List[str]:
+        """Overrides parent method so that a lowercased version of the string
+        is used to search the lookup table. This is necessary because our
+        lookup table consists entirely of lowercase keys."""
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        string = token.text.lower()
+        return [lookup_table.get(string, string)]
+
+    # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
+    def rule_lemmatize(self, token: Token) -> List[str]:
         # Difference 1: self.rules is assumed to be non-None, so no
         # 'is None' check required.
         # String lowercased from the get-go. All lemmatization results in

@@ -46,74 +36,61 @@ class DutchLemmatizer(Lemmatizer):
         # any problems, and it keeps the exceptions indexes small. If this
         # creates problems for proper nouns, we can introduce a check for
         # univ_pos == "PROPN".
-        string = string.lower()
-        try:
-            univ_pos = self.univ_pos_name_variants[univ_pos]
-        except KeyError:
-            # Because PROPN not in self.univ_pos_name_variants, proper names
-            # are not lemmatized. They are lowercased, however.
-            return [string]
-        # if string in self.lemma_index.get(univ_pos)
+        cache_key = (token.lower, token.pos)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        univ_pos = token.pos_.lower()
+        if univ_pos in ("", "eol", "space"):
+            forms = [string.lower()]
+            self.cache[cache_key] = forms
+            return forms
+
         index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        index = index_table.get(univ_pos, {})
+        exceptions = exc_table.get(univ_pos, {})
+        rules = rules_table.get(univ_pos, {})
+
+        string = string.lower()
+        if univ_pos not in (
+            "noun",
+            "verb",
+            "aux",
+            "adj",
+            "adv",
+            "pron",
+            "det",
+            "adp",
+            "num",
+        ):
+            forms = [string]
+            self.cache[cache_key] = forms
+            return forms
         lemma_index = index_table.get(univ_pos, {})
         # string is already lemma
         if string in lemma_index:
-            return [string]
+            forms = [string]
+            self.cache[cache_key] = forms
+            return forms
         exc_table = self.lookups.get_table("lemma_exc", {})
         exceptions = exc_table.get(univ_pos, {})
         # string is irregular token contained in exceptions index.
         try:
-            lemma = exceptions[string]
-            return [lemma[0]]
+            forms = [exceptions[string][0]]
+            self.cache[cache_key] = forms
+            return forms
         except KeyError:
             pass
         # string corresponds to key in lookup table
         lookup_table = self.lookups.get_table("lemma_lookup", {})
         looked_up_lemma = lookup_table.get(string)
         if looked_up_lemma and looked_up_lemma in lemma_index:
-            return [looked_up_lemma]
+            forms = [looked_up_lemma]
+            self.cache[cache_key] = forms
+            return forms
-        rules_table = self.lookups.get_table("lemma_rules", {})
-        forms, is_known = self.lemmatize(
-            string, lemma_index, exceptions, rules_table.get(univ_pos, [])
-        )
-        # Back-off through remaining return value candidates.
-        if forms:
-            if is_known:
-                return forms
-            else:
-                for form in forms:
-                    if form in exceptions:
-                        return [form]
-                if looked_up_lemma:
-                    return [looked_up_lemma]
-                else:
-                    return forms
-        elif looked_up_lemma:
-            return [looked_up_lemma]
-        else:
-            return [string]
 
-    # Overrides parent method so that a lowercased version of the string is
-    # used to search the lookup table. This is necessary because our lookup
-    # table consists entirely of lowercase keys.
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        string = string.lower()
-        if orth is not None:
-            return lookup_table.get(orth, string)
-        else:
-            return lookup_table.get(string, string)
-
-    # Reimplemented to focus more on application of suffix rules and to return
-    # as early as possible.
-    def lemmatize(
-        self,
-        string: str,
-        index: Dict[str, List[str]],
-        exceptions: Dict[str, Dict[str, List[str]]],
-        rules: Dict[str, List[List[str]]],
-    ) -> Tuple[List[str], bool]:
-        # returns (forms, is_known: bool)
         oov_forms = []
         for old, new in rules:
             if string.endswith(old):

@@ -121,7 +98,31 @@ class DutchLemmatizer(Lemmatizer):
                 if not form:
                     pass
                 elif form in index:
-                    return [form], True  # True = Is known (is lemma)
+                    forms = [form]
+                    self.cache[cache_key] = forms
+                    return forms
                 else:
                     oov_forms.append(form)
-        return list(set(oov_forms)), False
+        forms = list(set(oov_forms))
+        # Back-off through remaining return value candidates.
+        if forms:
+            for form in forms:
+                if form in exceptions:
+                    forms = [form]
+                    self.cache[cache_key] = forms
+                    return forms
+            if looked_up_lemma:
+                forms = [looked_up_lemma]
+                self.cache[cache_key] = forms
+                return forms
+            else:
+                self.cache[cache_key] = forms
+                return forms
+        elif looked_up_lemma:
+            forms = [looked_up_lemma]
+            self.cache[cache_key] = forms
+            return forms
+        else:
+            forms = [string]
+            self.cache[cache_key] = forms
+            return forms

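A recurring pattern in the rewritten rule_lemmatize methods is memoization keyed on the token's lowercase form and coarse POS. A stripped-down, self-contained illustration of that caching idea (names are hypothetical, not spaCy API):

cache = {}

def cached_lemmatize(key, compute):
    if key in cache:
        return cache[key]
    forms = compute()
    cache[key] = forms
    return forms

print(cached_lemmatize(("katten", "NOUN"), lambda: ["kat"]))      # computed once
print(cached_lemmatize(("katten", "NOUN"), lambda: ["unused"]))   # served from the cache -> ["kat"]
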
@@ -1,5 +1,6 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+
+from thinc.api import Model
 
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES

@@ -7,42 +8,16 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import PolishLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...lookups import load_lookups
+from ...lookups import Lookups
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.pl.PolishLemmatizer"
-"""
-
 TOKENIZER_EXCEPTIONS = {
     exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
 }
-
-
-@registry.lemmatizers("spacy.pl.PolishLemmatizer")
-def create_lemmatizer() -> Callable[[Language], PolishLemmatizer]:
-    # fmt: off
-    tables = [
-        "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv",
-        "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num",
-        "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"
-    ]
-    # fmt: on
-
-    def lemmatizer_factory(nlp: Language) -> PolishLemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return PolishLemmatizer(lookups=lookups)
-
-    return lemmatizer_factory
 
 
 class PolishDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES

@@ -56,4 +31,22 @@ class Polish(Language):
     Defaults = PolishDefaults
 
 
+@Polish.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "lookup", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Polish"]

@@ -1,7 +1,7 @@
-from typing import Optional, List, Dict
+from typing import List, Dict
 
-from ...lemmatizer import Lemmatizer
-from ...parts_of_speech import NAMES
+from ...pipeline import Lemmatizer
+from ...tokens import Token
 
 
 class PolishLemmatizer(Lemmatizer):

@@ -9,12 +9,30 @@ class PolishLemmatizer(Lemmatizer):
     # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS.
     # It utilizes some prefix based improvements for verb and adjectives
     # lemmatization, as well as case-sensitive lemmatization for nouns.
-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
-        if isinstance(univ_pos, int):
-            univ_pos = NAMES.get(univ_pos, "X")
-        univ_pos = univ_pos.upper()
 
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Dict:
+        if mode == "lookup":
+            return {
+                "required_tables": [
+                    "lemma_lookup_adj",
+                    "lemma_lookup_adp",
+                    "lemma_lookup_adv",
+                    "lemma_lookup_aux",
+                    "lemma_lookup_noun",
+                    "lemma_lookup_num",
+                    "lemma_lookup_part",
+                    "lemma_lookup_pron",
+                    "lemma_lookup_verb",
+                ]
+            }
+        else:
+            return super().get_lookups_config(mode)
+
+    def lookup_lemmatize(self, token: Token) -> List[str]:
+        string = token.text
+        univ_pos = token.pos_
+        morphology = token.morph.to_dict()
+        lookup_pos = univ_pos.lower()
         if univ_pos == "PROPN":
             lookup_pos = "noun"

@@ -71,15 +89,3 @@ class PolishLemmatizer(Lemmatizer):
                 return [lookup_table[string]]
             return [string.lower()]
         return [lookup_table.get(string, string)]
-
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
-        return string.lower()
-
-    def lemmatize(
-        self,
-        string: str,
-        index: Dict[str, List[str]],
-        exceptions: Dict[str, Dict[str, List[str]]],
-        rules: Dict[str, List[List[str]]],
-    ) -> List[str]:
-        raise NotImplementedError

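Polish defaults to the lookup mode, so its get_lookups_config lists the per-POS lemma_lookup_* tables that spacy-lookups-data has to provide. A hedged sketch:

from spacy.lang.pl.lemmatizer import PolishLemmatizer

cfg = PolishLemmatizer.get_lookups_config("lookup")
print(cfg["required_tables"][:3])  # ["lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv"]
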
@@ -1,32 +1,16 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+
+from thinc.api import Model
 
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
-from ...util import registry
 from ...language import Language
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.ru.RussianLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.ru.RussianLemmatizer")
-def create_lemmatizer() -> Callable[[Language], RussianLemmatizer]:
-    def lemmatizer_factory(nlp: Language) -> RussianLemmatizer:
-        return RussianLemmatizer()
-
-    return lemmatizer_factory
+from ...lookups import Lookups
 
 
 class RussianDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

@@ -37,4 +21,21 @@ class Russian(Language):
     Defaults = RussianDefaults
 
 
+@Russian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Russian"]

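The Russian factory defaults to the "pymorphy2" mode, and constructing the component raises ImportError when the pymorphy2 package is missing. A hedged sketch of what adding it looks like:

import spacy

nlp = spacy.blank("ru")
try:
    nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2"})
except ImportError as err:
    print("pymorphy2 is required for this mode:", err)
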
@@ -1,8 +1,12 @@
-from typing import Optional, Tuple, Dict, List
+from typing import Optional, List, Dict, Tuple
 
+from thinc.api import Model
+
-from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
-from ...lemmatizer import Lemmatizer
 from ...lookups import Lookups
+from ...pipeline import Lemmatizer
+from ...symbols import POS
+from ...tokens import Token
+from ...vocab import Vocab
 
 
 PUNCT_RULES = {"«": '"', "»": '"'}

@@ -11,8 +15,17 @@ PUNCT_RULES = {"«": '"', "»": '"'}
 class RussianLemmatizer(Lemmatizer):
     _morph = None
 
-    def __init__(self, lookups: Optional[Lookups] = None) -> None:
-        super(RussianLemmatizer, self).__init__(lookups)
+    def __init__(
+        self,
+        vocab: Vocab,
+        model: Optional[Model],
+        name: str = "lemmatizer",
+        *,
+        mode: str = "pymorphy2",
+        lookups: Optional[Lookups] = None,
+    ) -> None:
+        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
+
         try:
             from pymorphy2 import MorphAnalyzer
         except ImportError:

@@ -25,10 +38,10 @@ class RussianLemmatizer(Lemmatizer):
         if RussianLemmatizer._morph is None:
             RussianLemmatizer._morph = MorphAnalyzer()
 
-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
-        univ_pos = self.normalize_univ_pos(univ_pos)
+    def pymorphy2_lemmatize(self, token: Token) -> List[str]:
+        string = token.text
+        univ_pos = token.pos_
+        morphology = token.morph.to_dict()
         if univ_pos == "PUNCT":
             return [PUNCT_RULES.get(string, string)]
         if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):

@@ -81,25 +94,8 @@ class RussianLemmatizer(Lemmatizer):
             return [string.lower()]
         return list(set([analysis.normal_form for analysis in filtered_analyses]))
 
-    @staticmethod
-    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
-        if isinstance(univ_pos, str):
-            return univ_pos.upper()
-        symbols_to_str = {
-            ADJ: "ADJ",
-            DET: "DET",
-            NOUN: "NOUN",
-            NUM: "NUM",
-            PRON: "PRON",
-            PROPN: "PROPN",
-            PUNCT: "PUNCT",
-            VERB: "VERB",
-        }
-        if univ_pos in symbols_to_str:
-            return symbols_to_str[univ_pos]
-        return None
-
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
+    def lookup_lemmatize(self, token: Token) -> List[str]:
+        string = token.text
         analyses = self._morph.parse(string)
         if len(analyses) == 1:
             return analyses[0].normal_form

@@ -1,32 +1,16 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+
+from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...util import registry
-from ...language import Language
 from .lemmatizer import UkrainianLemmatizer
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.uk.UkrainianLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.uk.UkrainianLemmatizer")
-def create_ukrainian_lemmatizer() -> Callable[[Language], UkrainianLemmatizer]:
-    def lemmatizer_factory(nlp: Language) -> UkrainianLemmatizer:
-        return UkrainianLemmatizer()
-
-    return lemmatizer_factory
+from ...language import Language
+from ...lookups import Lookups
 
 
 class UkrainianDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

@@ -37,4 +21,21 @@ class Ukrainian(Language):
     Defaults = UkrainianDefaults
 
 
+@Ukrainian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Ukrainian"]

@@ -1,187 +1,30 @@
-from typing import Optional, List, Tuple, Dict
+from typing import Optional
 
-from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
+from thinc.api import Model
+
+from ..ru.lemmatizer import RussianLemmatizer
 from ...lookups import Lookups
-from ...lemmatizer import Lemmatizer
+from ...vocab import Vocab
 
 
-PUNCT_RULES = {"«": '"', "»": '"'}
-
-
-class UkrainianLemmatizer(Lemmatizer):
-    _morph = None
-
-    def __init__(self, lookups: Optional[Lookups] = None) -> None:
-        super(UkrainianLemmatizer, self).__init__(lookups)
+class UkrainianLemmatizer(RussianLemmatizer):
+    def __init__(
+        self,
+        vocab: Vocab,
+        model: Optional[Model],
+        name: str = "lemmatizer",
+        *,
+        mode: str = "pymorphy2",
+        lookups: Optional[Lookups] = None,
+    ) -> None:
+        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
         try:
             from pymorphy2 import MorphAnalyzer
-
-            if UkrainianLemmatizer._morph is None:
-                UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
-        except (ImportError, TypeError):
+        except ImportError:
             raise ImportError(
                 "The Ukrainian lemmatizer requires the pymorphy2 library and "
                 'dictionaries: try to fix it with "pip uninstall pymorphy2" and'
                 '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
             ) from None
-
-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
-        univ_pos = self.normalize_univ_pos(univ_pos)
-        if univ_pos == "PUNCT":
-            return [PUNCT_RULES.get(string, string)]
-        if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
-            # Skip unchangeable pos
-            return [string.lower()]
-        analyses = self._morph.parse(string)
-        filtered_analyses = []
-        for analysis in analyses:
-            if not analysis.is_known:
-                # Skip suggested parse variant for unknown word for pymorphy
-                continue
-            analysis_pos, _ = oc2ud(str(analysis.tag))
-            if analysis_pos == univ_pos or (
-                analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
-            ):
-                filtered_analyses.append(analysis)
-        if not len(filtered_analyses):
-            return [string.lower()]
-        if morphology is None or (len(morphology) == 1 and POS in morphology):
-            return list(set([analysis.normal_form for analysis in filtered_analyses]))
-        if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
-            features_to_compare = ["Case", "Number", "Gender"]
-        elif univ_pos == "NUM":
-            features_to_compare = ["Case", "Gender"]
-        elif univ_pos == "PRON":
-            features_to_compare = ["Case", "Number", "Gender", "Person"]
-        else:  # VERB
-            features_to_compare = [
-                "Aspect",
-                "Gender",
-                "Mood",
-                "Number",
-                "Tense",
-                "VerbForm",
-                "Voice",
-            ]
-        analyses, filtered_analyses = filtered_analyses, []
-        for analysis in analyses:
-            _, analysis_morph = oc2ud(str(analysis.tag))
-            for feature in features_to_compare:
-                if (
-                    feature in morphology
-                    and feature in analysis_morph
-                    and morphology[feature].lower() != analysis_morph[feature].lower()
-                ):
-                    break
-            else:
-                filtered_analyses.append(analysis)
-        if not len(filtered_analyses):
-            return [string.lower()]
-        return list(set([analysis.normal_form for analysis in filtered_analyses]))
-
-    @staticmethod
-    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
-        if isinstance(univ_pos, str):
-            return univ_pos.upper()
-        symbols_to_str = {
-            ADJ: "ADJ",
-            DET: "DET",
-            NOUN: "NOUN",
-            NUM: "NUM",
-            PRON: "PRON",
-            PROPN: "PROPN",
-            PUNCT: "PUNCT",
-            VERB: "VERB",
-        }
-        if univ_pos in symbols_to_str:
-            return symbols_to_str[univ_pos]
-        return None
-
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
-        analyses = self._morph.parse(string)
-        if len(analyses) == 1:
-            return analyses[0].normal_form
-        return string
-
-
-def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
-    gram_map = {
-        "_POS": {
-            "ADJF": "ADJ",
-            "ADJS": "ADJ",
-            "ADVB": "ADV",
-            "Apro": "DET",
-            "COMP": "ADJ",  # Can also be an ADV - unchangeable
-            "CONJ": "CCONJ",  # Can also be a SCONJ - both unchangeable ones
-            "GRND": "VERB",
-            "INFN": "VERB",
-            "INTJ": "INTJ",
-            "NOUN": "NOUN",
-            "NPRO": "PRON",
-            "NUMR": "NUM",
-            "NUMB": "NUM",
-            "PNCT": "PUNCT",
-            "PRCL": "PART",
-            "PREP": "ADP",
-            "PRTF": "VERB",
-            "PRTS": "VERB",
-            "VERB": "VERB",
-        },
-        "Animacy": {"anim": "Anim", "inan": "Inan"},
-        "Aspect": {"impf": "Imp", "perf": "Perf"},
-        "Case": {
-            "ablt": "Ins",
-            "accs": "Acc",
-            "datv": "Dat",
-            "gen1": "Gen",
-            "gen2": "Gen",
-            "gent": "Gen",
-            "loc2": "Loc",
-            "loct": "Loc",
-            "nomn": "Nom",
-            "voct": "Voc",
-        },
-        "Degree": {"COMP": "Cmp", "Supr": "Sup"},
-        "Gender": {"femn": "Fem", "masc": "Masc", "neut": "Neut"},
-        "Mood": {"impr": "Imp", "indc": "Ind"},
-        "Number": {"plur": "Plur", "sing": "Sing"},
-        "NumForm": {"NUMB": "Digit"},
-        "Person": {"1per": "1", "2per": "2", "3per": "3", "excl": "2", "incl": "1"},
-        "Tense": {"futr": "Fut", "past": "Past", "pres": "Pres"},
-        "Variant": {"ADJS": "Brev", "PRTS": "Brev"},
-        "VerbForm": {
-            "GRND": "Conv",
-            "INFN": "Inf",
-            "PRTF": "Part",
-            "PRTS": "Part",
-            "VERB": "Fin",
-        },
-        "Voice": {"actv": "Act", "pssv": "Pass"},
-        "Abbr": {"Abbr": "Yes"},
-    }
-    pos = "X"
-    morphology = dict()
-    unmatched = set()
-    grams = oc_tag.replace(" ", ",").split(",")
-    for gram in grams:
-        match = False
-        for categ, gmap in sorted(gram_map.items()):
-            if gram in gmap:
-                match = True
-                if categ == "_POS":
-                    pos = gmap[gram]
-                else:
-                    morphology[categ] = gmap[gram]
-        if not match:
-            unmatched.add(gram)
-    while len(unmatched) > 0:
-        gram = unmatched.pop()
-        if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
-            pos = "PROPN"
-        elif gram == "Auxt":
-            pos = "AUX"
-        elif gram == "Pltm":
-            morphology["Number"] = "Ptan"
-    return pos, morphology
+        if UkrainianLemmatizer._morph is None:
+            UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")

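After this rewrite UkrainianLemmatizer is a thin subclass of RussianLemmatizer that only overrides __init__ to load the Ukrainian pymorphy2 dictionaries; the analysis logic lives in the shared parent class. A hedged check:

from spacy.lang.ru.lemmatizer import RussianLemmatizer
from spacy.lang.uk.lemmatizer import UkrainianLemmatizer

print(issubclass(UkrainianLemmatizer, RussianLemmatizer))  # True per the class definition above
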
@@ -29,7 +29,6 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
 from .lookups import load_lookups
 from .tokenizer import Tokenizer
-from .lemmatizer import Lemmatizer
 from .errors import Errors, Warnings
 from .schemas import ConfigSchema
 from .git_info import GIT_VERSION

@@ -87,22 +86,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory
 
 
-@registry.lemmatizers("spacy.Lemmatizer.v1")
-def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]:
-    """Registered function to create a lemmatizer. Returns a factory that takes
-    the nlp object and returns a Lemmatizer instance with data loaded in from
-    spacy-lookups-data, if the package is installed.
-    """
-    # TODO: Will be replaced when the lemmatizer becomes a pipeline component
-    tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
-
-    def lemmatizer_factory(nlp: "Language") -> "Lemmatizer":
-        lookups = load_lookups(lang=nlp.lang, tables=tables, strict=False)
-        return Lemmatizer(lookups=lookups)
-
-    return lemmatizer_factory
-
-
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.

@@ -128,7 +111,6 @@ class Language:
         max_length: int = 10 ** 6,
         meta: Dict[str, Any] = {},
         create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
-        create_lemmatizer: Optional[Callable[["Language"], Callable]] = None,
         **kwargs,
     ) -> None:
         """Initialise a Language object.

@@ -146,8 +128,6 @@ class Language:
             100,000 characters in one text.
         create_tokenizer (Callable): Function that takes the nlp object and
             returns a tokenizer.
-        create_lemmatizer (Callable): Function that takes the nlp object and
-            returns a lemmatizer.
 
         DOCS: https://spacy.io/api/language#init
         """

@@ -166,13 +146,9 @@ class Language:
 
         if vocab is True:
            vectors_name = meta.get("vectors", {}).get("name")
-            if not create_lemmatizer:
-                lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
-                create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
            vocab = create_vocab(
                self.lang,
                self.Defaults,
-                lemmatizer=create_lemmatizer(self),
                vectors_name=vectors_name,
                load_data=self._config["nlp"]["load_vocab_data"],
            )

@@ -1451,7 +1427,6 @@ class Language:
        filled["components"] = orig_pipeline
        config["components"] = orig_pipeline
        create_tokenizer = resolved["nlp"]["tokenizer"]
-        create_lemmatizer = resolved["nlp"]["lemmatizer"]
        before_creation = resolved["nlp"]["before_creation"]
        after_creation = resolved["nlp"]["after_creation"]
        after_pipeline_creation = resolved["nlp"]["after_pipeline_creation"]

@@ -1467,7 +1442,6 @@ class Language:
        nlp = lang_cls(
            vocab=vocab,
            create_tokenizer=create_tokenizer,
-            create_lemmatizer=create_lemmatizer,
        )
        if after_creation is not None:
            nlp = after_creation(nlp)

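The net effect of the language.py changes is that nothing lemmatizer-related is created at vocab construction time anymore: there is no create_lemmatizer hook and no [nlp.lemmatizer] block, so lemmas only appear once a "lemmatizer" component is added to the pipeline. A hedged sketch:

import spacy

nlp = spacy.blank("en")
print("lemmatizer" in nlp.pipe_names)  # False until nlp.add_pipe("lemmatizer") is called explicitly
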
@ -1,145 +0,0 @@
|
|||
from typing import Optional, Callable, List, Dict
|
||||
|
||||
from .lookups import Lookups
|
||||
from .parts_of_speech import NAMES as UPOS_NAMES
|
||||
|
||||
|
||||
class Lemmatizer:
|
||||
"""
|
||||
The Lemmatizer supports simple part-of-speech-sensitive suffix rules and
|
||||
lookup tables.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
lookups: Optional[Lookups] = None,
|
||||
is_base_form: Optional[Callable] = None,
|
||||
) -> None:
|
||||
"""Initialize a Lemmatizer.
|
||||
|
||||
lookups (Lookups): The lookups object containing the (optional) tables
|
||||
"lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
|
||||
"""
|
||||
self.lookups = lookups if lookups is not None else Lookups()
|
||||
self.is_base_form = is_base_form
|
||||
|
||||
def __call__(
|
||||
self, string: str, univ_pos: str, morphology: Optional[dict] = None
|
||||
) -> List[str]:
|
||||
"""Lemmatize a string.
|
||||
|
||||
string (str): The string to lemmatize, e.g. the token text.
|
||||
univ_pos (str / int): The token's universal part-of-speech tag.
|
||||
morphology (dict): The token's morphological features following the
|
||||
Universal Dependencies scheme.
|
||||
RETURNS (list): The available lemmas for the string.
|
||||
"""
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
if "lemma_rules" not in self.lookups:
|
||||
return [lookup_table.get(string, string)]
|
||||
if isinstance(univ_pos, int):
|
||||
univ_pos = UPOS_NAMES.get(univ_pos, "X")
|
||||
univ_pos = univ_pos.lower()
|
||||
if univ_pos in ("", "eol", "space"):
|
||||
return [string.lower()]
|
||||
# See Issue #435 for an example of where this logic is required.
|
||||
if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology):
|
||||
return [string.lower()]
|
||||
index_table = self.lookups.get_table("lemma_index", {})
|
||||
exc_table = self.lookups.get_table("lemma_exc", {})
|
||||
rules_table = self.lookups.get_table("lemma_rules", {})
|
||||
if not any(
|
||||
(
|
||||
index_table.get(univ_pos),
|
||||
exc_table.get(univ_pos),
|
||||
rules_table.get(univ_pos),
|
||||
)
|
||||
):
|
||||
if univ_pos == "propn":
|
||||
return [string]
|
||||
else:
|
||||
return [string.lower()]
|
||||
lemmas = self.lemmatize(
|
||||
string,
|
||||
index_table.get(univ_pos, {}),
|
||||
exc_table.get(univ_pos, {}),
|
||||
rules_table.get(univ_pos, []),
|
||||
)
|
||||
return lemmas
|
||||
|
||||
def noun(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "noun", morphology)
|
||||
|
||||
def verb(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "verb", morphology)
|
||||
|
||||
def adj(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "adj", morphology)
|
||||
|
||||
def det(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "det", morphology)
|
||||
|
||||
def pron(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "pron", morphology)
|
||||
|
||||
def adp(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "adp", morphology)
|
||||
|
||||
def num(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "num", morphology)
|
||||
|
||||
def punct(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "punct", morphology)
|
||||
|
||||
def lookup(self, string: str, orth: Optional[int] = None) -> str:
|
||||
"""Look up a lemma in the table, if available. If no lemma is found,
|
||||
the original string is returned.
|
||||
|
||||
string (str): The original string.
|
||||
orth (int): Optional hash of the string to look up. If not set, the
|
||||
string will be used and hashed.
|
||||
RETURNS (str): The lemma if the string was found, otherwise the
|
||||
original string.
|
||||
"""
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
key = orth if orth is not None else string
|
||||
if key in lookup_table:
|
||||
return lookup_table[key]
|
||||
return string
|
||||
|
||||
def lemmatize(
|
||||
self,
|
||||
string: str,
|
||||
index: Dict[str, List[str]],
|
||||
exceptions: Dict[str, Dict[str, List[str]]],
|
||||
rules: Dict[str, List[List[str]]],
|
||||
) -> List[str]:
|
||||
orig = string
|
||||
string = string.lower()
|
||||
forms = []
|
||||
oov_forms = []
|
||||
for old, new in rules:
|
||||
if string.endswith(old):
|
||||
form = string[: len(string) - len(old)] + new
|
||||
if not form:
|
||||
pass
|
||||
elif form in index or not form.isalpha():
|
||||
forms.append(form)
|
||||
else:
|
||||
oov_forms.append(form)
|
||||
# Remove duplicates but preserve the ordering of applied "rules"
|
||||
forms = list(dict.fromkeys(forms))
|
||||
# Put exceptions at the front of the list, so they get priority.
|
||||
# This is a dodgy heuristic -- but it's the best we can do until we get
|
||||
# frequencies on this. We can at least prune out problematic exceptions,
|
||||
# if they shadow more frequent analyses.
|
||||
for form in exceptions.get(string, []):
|
||||
if form not in forms:
|
||||
forms.insert(0, form)
|
||||
if not forms:
|
||||
forms.extend(oov_forms)
|
||||
if not forms:
|
||||
forms.append(orig)
|
||||
return forms
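# Editor's illustration (not part of the diff): a hand-worked pass through the
# suffix-rule loop above with a toy rule set. The real index/rules tables come
# from spacy-lookups-data; these values are made up for the example, and the
# empty-form branch is omitted.
index = {"jump"}                      # known lemmas for this part of speech
rules = [["ing", ""], ["ing", "e"]]   # ordered (old_suffix, new_suffix) pairs
string = "jumping"
forms, oov_forms = [], []
for old, new in rules:
    if string.endswith(old):
        form = string[: len(string) - len(old)] + new
        if form in index or not form.isalpha():
            forms.append(form)
        else:
            oov_forms.append(form)
assert forms == ["jump"] and oov_forms == ["jumpe"]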
|
304
spacy/lookups.py
|
@ -28,6 +28,8 @@ def load_lookups(
|
|||
# TODO: import spacy_lookups_data instead of going via entry points here?
|
||||
lookups = Lookups()
|
||||
if lang not in registry.lookups:
|
||||
if strict and len(tables) > 0:
|
||||
raise ValueError(Errors.E955.format(table=", ".join(tables), lang=lang))
|
||||
return lookups
|
||||
data = registry.lookups.get(lang)
|
||||
for table in tables:
|
||||
|
@ -41,152 +43,6 @@ def load_lookups(
|
|||
return lookups
|
||||
|
||||
|
||||
class Lookups:
|
||||
"""Container for large lookup tables and dictionaries, e.g. lemmatization
|
||||
data or tokenizer exception lists. Lookups are available via vocab.lookups,
|
||||
so they can be accessed before the pipeline components are applied (e.g.
|
||||
in the tokenizer and lemmatizer), as well as within the pipeline components
|
||||
via doc.vocab.lookups.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize the Lookups object.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#init
|
||||
"""
|
||||
self._tables = {}
|
||||
|
||||
def __contains__(self, name: str) -> bool:
|
||||
"""Check if the lookups contain a table of a given name. Delegates to
|
||||
Lookups.has_table.
|
||||
|
||||
name (str): Name of the table.
|
||||
RETURNS (bool): Whether a table of that name is in the lookups.
|
||||
"""
|
||||
return self.has_table(name)
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""RETURNS (int): The number of tables in the lookups."""
|
||||
return len(self._tables)
|
||||
|
||||
@property
|
||||
def tables(self) -> List[str]:
|
||||
"""RETURNS (List[str]): Names of all tables in the lookups."""
|
||||
return list(self._tables.keys())
|
||||
|
||||
def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table":
|
||||
"""Add a new table to the lookups. Raises an error if the table exists.
|
||||
|
||||
name (str): Unique name of table.
|
||||
data (dict): Optional data to add to the table.
|
||||
RETURNS (Table): The newly added table.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#add_table
|
||||
"""
|
||||
if name in self.tables:
|
||||
raise ValueError(Errors.E158.format(name=name))
|
||||
table = Table(name=name, data=data)
|
||||
self._tables[name] = table
|
||||
return table
|
||||
|
||||
def get_table(self, name: str, default: Any = UNSET) -> "Table":
|
||||
"""Get a table. Raises an error if the table doesn't exist and no
|
||||
default value is provided.
|
||||
|
||||
name (str): Name of the table.
|
||||
default (Any): Optional default value to return if table doesn't exist.
|
||||
RETURNS (Table): The table.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#get_table
|
||||
"""
|
||||
if name not in self._tables:
|
||||
if default == UNSET:
|
||||
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
|
||||
return default
|
||||
return self._tables[name]
|
||||
|
||||
def remove_table(self, name: str) -> "Table":
|
||||
"""Remove a table. Raises an error if the table doesn't exist.
|
||||
|
||||
name (str): Name of the table to remove.
|
||||
RETURNS (Table): The removed table.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#remove_table
|
||||
"""
|
||||
if name not in self._tables:
|
||||
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
|
||||
return self._tables.pop(name)
|
||||
|
||||
def has_table(self, name: str) -> bool:
|
||||
"""Check if the lookups contain a table of a given name.
|
||||
|
||||
name (str): Name of the table.
|
||||
RETURNS (bool): Whether a table of that name exists.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#has_table
|
||||
"""
|
||||
return name in self._tables
|
||||
|
||||
def to_bytes(self, **kwargs) -> bytes:
|
||||
"""Serialize the lookups to a bytestring.
|
||||
|
||||
RETURNS (bytes): The serialized Lookups.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#to_bytes
|
||||
"""
|
||||
return srsly.msgpack_dumps(self._tables)
|
||||
|
||||
def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
|
||||
"""Load the lookups from a bytestring.
|
||||
|
||||
bytes_data (bytes): The data to load.
|
||||
RETURNS (Lookups): The loaded Lookups.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#from_bytes
|
||||
"""
|
||||
self._tables = {}
|
||||
for key, value in srsly.msgpack_loads(bytes_data).items():
|
||||
self._tables[key] = Table(key, value)
|
||||
return self
|
||||
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
|
||||
) -> None:
|
||||
"""Save the lookups to a directory as lookups.bin. Expects a path to a
|
||||
directory, which will be created if it doesn't exist.
|
||||
|
||||
path (str / Path): The directory path.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#to_disk
|
||||
"""
|
||||
if len(self._tables):
|
||||
path = ensure_path(path)
|
||||
if not path.exists():
|
||||
path.mkdir()
|
||||
filepath = path / filename
|
||||
with filepath.open("wb") as file_:
|
||||
file_.write(self.to_bytes())
|
||||
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
|
||||
) -> "Lookups":
|
||||
"""Load lookups from a directory containing a lookups.bin. Will skip
|
||||
loading if the file doesn't exist.
|
||||
|
||||
path (str / Path): The directory path.
|
||||
RETURNS (Lookups): The loaded lookups.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#from_disk
|
||||
"""
|
||||
path = ensure_path(path)
|
||||
filepath = path / filename
|
||||
if filepath.exists():
|
||||
with filepath.open("rb") as file_:
|
||||
data = file_.read()
|
||||
return self.from_bytes(data)
|
||||
return self
|
||||
|
||||
|
||||
class Table(OrderedDict):
|
||||
"""A table in the lookups. Subclass of builtin dict that implements a
|
||||
slightly more consistent and unified API.
|
||||
|
@ -303,3 +159,159 @@ class Table(OrderedDict):
|
|||
self.clear()
|
||||
self.update(data)
|
||||
return self
|
||||
|
||||
|
||||
class Lookups:
|
||||
"""Container for large lookup tables and dictionaries, e.g. lemmatization
|
||||
data or tokenizer exception lists. Lookups are available via vocab.lookups,
|
||||
so they can be accessed before the pipeline components are applied (e.g.
|
||||
in the tokenizer and lemmatizer), as well as within the pipeline components
|
||||
via doc.vocab.lookups.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize the Lookups object.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#init
|
||||
"""
|
||||
self._tables = {}
|
||||
|
||||
def __contains__(self, name: str) -> bool:
|
||||
"""Check if the lookups contain a table of a given name. Delegates to
|
||||
Lookups.has_table.
|
||||
|
||||
name (str): Name of the table.
|
||||
RETURNS (bool): Whether a table of that name is in the lookups.
|
||||
"""
|
||||
return self.has_table(name)
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""RETURNS (int): The number of tables in the lookups."""
|
||||
return len(self._tables)
|
||||
|
||||
@property
|
||||
def tables(self) -> List[str]:
|
||||
"""RETURNS (List[str]): Names of all tables in the lookups."""
|
||||
return list(self._tables.keys())
|
||||
|
||||
def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> Table:
|
||||
"""Add a new table to the lookups. Raises an error if the table exists.
|
||||
|
||||
name (str): Unique name of table.
|
||||
data (dict): Optional data to add to the table.
|
||||
RETURNS (Table): The newly added table.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#add_table
|
||||
"""
|
||||
if name in self.tables:
|
||||
raise ValueError(Errors.E158.format(name=name))
|
||||
table = Table(name=name, data=data)
|
||||
self._tables[name] = table
|
||||
return table
|
||||
|
||||
def set_table(self, name: str, table: Table) -> None:
|
||||
"""Set a table.
|
||||
|
||||
name (str): Name of the table to set.
|
||||
table (Table): The Table to set.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#set_table
|
||||
"""
|
||||
self._tables[name] = table
|
||||
|
||||
def get_table(self, name: str, default: Any = UNSET) -> Table:
|
||||
"""Get a table. Raises an error if the table doesn't exist and no
|
||||
default value is provided.
|
||||
|
||||
name (str): Name of the table.
|
||||
default (Any): Optional default value to return if table doesn't exist.
|
||||
RETURNS (Table): The table.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#get_table
|
||||
"""
|
||||
if name not in self._tables:
|
||||
if default == UNSET:
|
||||
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
|
||||
return default
|
||||
return self._tables[name]
|
||||
|
||||
def remove_table(self, name: str) -> Table:
|
||||
"""Remove a table. Raises an error if the table doesn't exist.
|
||||
|
||||
name (str): Name of the table to remove.
|
||||
RETURNS (Table): The removed table.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#remove_table
|
||||
"""
|
||||
if name not in self._tables:
|
||||
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
|
||||
return self._tables.pop(name)
|
||||
|
||||
def has_table(self, name: str) -> bool:
|
||||
"""Check if the lookups contain a table of a given name.
|
||||
|
||||
name (str): Name of the table.
|
||||
RETURNS (bool): Whether a table of that name exists.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#has_table
|
||||
"""
|
||||
return name in self._tables
|
||||
|
||||
def to_bytes(self, **kwargs) -> bytes:
|
||||
"""Serialize the lookups to a bytestring.
|
||||
|
||||
RETURNS (bytes): The serialized Lookups.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#to_bytes
|
||||
"""
|
||||
return srsly.msgpack_dumps(self._tables)
|
||||
|
||||
def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
|
||||
"""Load the lookups from a bytestring.
|
||||
|
||||
bytes_data (bytes): The data to load.
|
||||
RETURNS (Lookups): The loaded Lookups.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#from_bytes
|
||||
"""
|
||||
self._tables = {}
|
||||
for key, value in srsly.msgpack_loads(bytes_data).items():
|
||||
self._tables[key] = Table(key, value)
|
||||
return self
|
||||
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
|
||||
) -> None:
|
||||
"""Save the lookups to a directory as lookups.bin. Expects a path to a
|
||||
directory, which will be created if it doesn't exist.
|
||||
|
||||
path (str / Path): The directory path.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#to_disk
|
||||
"""
|
||||
if len(self._tables):
|
||||
path = ensure_path(path)
|
||||
if not path.exists():
|
||||
path.mkdir()
|
||||
filepath = path / filename
|
||||
with filepath.open("wb") as file_:
|
||||
file_.write(self.to_bytes())
|
||||
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
|
||||
) -> "Lookups":
|
||||
"""Load lookups from a directory containing a lookups.bin. Will skip
|
||||
loading if the file doesn't exist.
|
||||
|
||||
path (str / Path): The directory path.
|
||||
RETURNS (Lookups): The loaded lookups.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#from_disk
|
||||
"""
|
||||
path = ensure_path(path)
|
||||
filepath = path / filename
|
||||
if filepath.exists():
|
||||
with filepath.open("rb") as file_:
|
||||
data = file_.read()
|
||||
return self.from_bytes(data)
|
||||
return self
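# Editor's note: a minimal usage sketch of the Lookups/Table API defined above,
# assuming this module is importable as spacy.lookups.
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog"})
assert lookups.has_table("lemma_lookup")
assert lookups.get_table("lemma_lookup").get("dogs") == "dog"
# Round-trip through bytes, as to_disk()/from_disk() do internally.
restored = Lookups().from_bytes(lookups.to_bytes())
assert "lemma_lookup" in restored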
|
||||
|
|
|
@ -1,20 +1,73 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, List
|
||||
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
|
||||
from thinc.types import Floats2d
|
||||
|
||||
from ...util import registry
|
||||
from .._precomputable_affine import PrecomputableAffine
|
||||
from ..tb_framework import TransitionModel
|
||||
from ...tokens import Doc
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.TransitionBasedParser.v1")
|
||||
def build_tb_parser_model(
|
||||
tok2vec: Model,
|
||||
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||
nr_feature_tokens: int,
|
||||
hidden_width: int,
|
||||
maxout_pieces: int,
|
||||
use_upper: bool = True,
|
||||
nO: Optional[int] = None,
|
||||
) -> Model:
|
||||
"""
|
||||
Build a transition-based parser model. Can apply to NER or dependency-parsing.
|
||||
|
||||
Transition-based parsing is an approach to structured prediction where the
|
||||
task of predicting the structure is mapped to a series of state transitions.
|
||||
You might find this tutorial helpful as background:
|
||||
https://explosion.ai/blog/parsing-english-in-python
|
||||
|
||||
The neural network state prediction model consists of either two or three
|
||||
subnetworks:
|
||||
|
||||
* tok2vec: Map each token into a vector representation. This subnetwork
|
||||
is run once for each batch.
|
||||
* lower: Construct a feature-specific vector for each (token, feature) pair.
|
||||
This is also run once for each batch. Constructing the state
|
||||
representation is then simply a matter of summing the component features
|
||||
and applying the non-linearity.
|
||||
* upper (optional): A feed-forward network that predicts scores from the
|
||||
state representation. If not present, the output from the lower model is
|
||||
used as action scores directly.
|
||||
|
||||
tok2vec (Model[List[Doc], List[Floats2d]]):
|
||||
Subnetwork to map tokens into vector representations.
|
||||
nr_feature_tokens (int): The number of tokens in the context to use to
|
||||
construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The
|
||||
2, 8 and 13 feature sets are designed for the parser, while the 3 and 6
|
||||
feature sets are designed for the NER. The recommended feature sets are
|
||||
3 for NER, and 8 for the dependency parser.
|
||||
|
||||
TODO: This feature should be split into two, state_type: ["deps", "ner"]
|
||||
and extra_state_features: [True, False]. This would map into:
|
||||
|
||||
(deps, False): 8
|
||||
(deps, True): 13
|
||||
(ner, False): 3
|
||||
(ner, True): 6
|
||||
|
||||
hidden_width (int): The width of the hidden layer.
|
||||
maxout_pieces (int): How many pieces to use in the state prediction layer.
|
||||
Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
|
||||
is replaced with a ReLu non-linearity if use_upper=True, and no
|
||||
non-linearity if use_upper=False.
|
||||
use_upper (bool): Whether to use an additional hidden layer after the state
|
||||
vector in order to predict the action scores. It is recommended to set
|
||||
this to False for large pretrained models such as transformers, and True
|
||||
for smaller networks. The upper layer is computed on CPU, which becomes
|
||||
a bottleneck on larger GPU-based models, where it's also less necessary.
|
||||
nO (int or None): The number of actions the model will predict between.
|
||||
Usually inferred from data at the beginning of training, or loaded from
|
||||
disk.
|
||||
"""
|
||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
|
||||
tok2vec.set_dim("nO", hidden_width)
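# Editor's sketch (not part of the diff): a registered architecture like the one
# above is typically resolved by name through the registry; the tok2vec argument
# would itself come from another registered architecture, and the keyword values
# below are only illustrative.
from spacy.util import registry

make_parser_model = registry.architectures.get("spacy.TransitionBasedParser.v1")
# e.g. make_parser_model(tok2vec, nr_feature_tokens=8, hidden_width=64,
#                        maxout_pieces=2, use_upper=True)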
|
||||
|
|
|
@ -10,10 +10,24 @@ from .._iob import IOB
|
|||
from ...util import registry
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.BiluoTagger.v1")
|
||||
@registry.architectures.register("spacy.BILUOTagger.v1")
|
||||
def BiluoTagger(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]]
|
||||
) -> Model[List[Doc], List[Floats2d]]:
|
||||
"""Construct a simple NER tagger, that predicts BILUO tag scores for each
|
||||
token and uses greedy decoding with transition-constraints to return a valid
|
||||
BILUO tag sequence.
|
||||
|
||||
A BILUO tag sequence encodes a sequence of non-overlapping labelled spans
|
||||
into tags assigned to each token. The first token of a span is given the
|
||||
tag B-LABEL, the last token of the span is given the tag L-LABEL, and tokens
|
||||
within the span are given the tag I-LABEL. Single-token spans are given
|
||||
the tag U-LABEL. All other tokens are assigned the tag O.
|
||||
|
||||
The BILUO tag scheme generally results in better linear separation between
|
||||
classes, especially for non-CRF models, because there are more distinct classes
|
||||
for the different situations (Ratinov et al., 2009).
|
||||
"""
|
||||
biluo = BILUO()
|
||||
linear = Linear(
|
||||
nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02)
|
||||
|
@ -41,6 +55,15 @@ def BiluoTagger(
|
|||
def IOBTagger(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]]
|
||||
) -> Model[List[Doc], List[Floats2d]]:
|
||||
"""Construct a simple NER tagger, that predicts IOB tag scores for each
|
||||
token and uses greedy decoding with transition-constraints to return a valid
|
||||
IOB tag sequence.
|
||||
|
||||
An IOB tag sequence encodes a sequence of non-overlapping labelled spans
|
||||
into tags assigned to each token. The first token of a span is given the
|
||||
tag B-LABEL, and subsequent tokens are given the tag I-LABEL.
|
||||
All other tokens are assigned the tag O.
|
||||
"""
|
||||
biluo = IOB()
|
||||
linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
|
||||
model = chain(
|
||||
|
|
|
@ -1,11 +1,22 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, List
|
||||
from thinc.api import zero_init, with_array, Softmax, chain, Model
|
||||
from thinc.types import Floats2d
|
||||
|
||||
from ...util import registry
|
||||
from ...tokens import Doc
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.Tagger.v1")
|
||||
def build_tagger_model(tok2vec: Model, nO: Optional[int] = None) -> Model:
|
||||
def build_tagger_model(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None
|
||||
) -> Model[List[Doc], List[Floats2d]]:
|
||||
"""Build a tagger model, using a provided token-to-vector component. The tagger
|
||||
model simply adds a linear layer with softmax activation to predict scores
|
||||
given the token vectors.
|
||||
|
||||
tok2vec (Model[List[Doc], List[Floats2d]]): The token-to-vector subnetwork.
|
||||
nO (int or None): The number of tags to output. Inferred from the data if None.
|
||||
"""
|
||||
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
|
||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||
output_layer = Softmax(nO, t2v_width, init_W=zero_init)
|
||||
|
|
|
@ -45,6 +45,7 @@ def build_bow_text_classifier(
|
|||
no_output_layer: bool,
|
||||
nO: Optional[int] = None,
|
||||
) -> Model:
|
||||
# Don't document this yet, I'm not sure it's right.
|
||||
with Model.define_operators({">>": chain}):
|
||||
sparse_linear = SparseLinear(nO)
|
||||
model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
|
||||
|
@ -69,6 +70,7 @@ def build_text_classifier(
|
|||
dropout: Optional[float],
|
||||
nO: Optional[int] = None,
|
||||
) -> Model:
|
||||
# Don't document this yet, I'm not sure it's right.
|
||||
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
|
||||
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
||||
lower = HashEmbed(
|
||||
|
@ -160,6 +162,7 @@ def build_text_classifier_lowdata(
|
|||
dropout: Optional[float],
|
||||
nO: Optional[int] = None,
|
||||
) -> Model:
|
||||
# Don't document this yet, I'm not sure it's right.
|
||||
# Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
|
||||
with Model.define_operators({">>": chain, "**": clone}):
|
||||
model = (
|
||||
|
|
|
@ -28,11 +28,31 @@ def build_hash_embed_cnn_tok2vec(
|
|||
window_size: int,
|
||||
maxout_pieces: int,
|
||||
subword_features: bool,
|
||||
dropout: Optional[float],
|
||||
pretrained_vectors: Optional[bool]
|
||||
) -> Model[List[Doc], List[Floats2d]]:
|
||||
"""Build spaCy's 'standard' tok2vec layer, which uses hash embedding
|
||||
with subword features and a CNN with layer-normalized maxout."""
|
||||
with subword features and a CNN with layer-normalized maxout.
|
||||
|
||||
width (int): The width of the input and output. These are required to be the
|
||||
same, so that residual connections can be used. Recommended values are
|
||||
96, 128 or 300.
|
||||
depth (int): The number of convolutional layers to use. Recommended values
|
||||
are between 2 and 8.
|
||||
window_size (int): The number of tokens on either side to concatenate during
|
||||
the convolutions. The receptive field of the CNN will be
|
||||
depth * window_size * 2 + 1, so a 4-layer network with window_size of
|
||||
2 will be sensitive to 17 words at a time. Recommended value is 1.
|
||||
embed_size (int): The number of rows in the hash embedding tables. This can
|
||||
be surprisingly small, due to the use of the hash embeddings. Recommended
|
||||
values are between 2000 and 10000.
|
||||
maxout_pieces (int): The number of pieces to use in the maxout non-linearity.
|
||||
If 1, the Mish non-linearity is used instead. Recommended values are 1-3.
|
||||
subword_features (bool): Whether to also embed subword features, specifically
|
||||
the prefix, suffix and word shape. This is recommended for alphabetic
|
||||
languages like English, but not if single-character tokens are used for
|
||||
a language such as Chinese.
|
||||
pretrained_vectors (bool): Whether to also use static vectors.
|
||||
"""
|
||||
return build_Tok2Vec_model(
|
||||
embed=MultiHashEmbed(
|
||||
width=width,
|
||||
|
@ -54,7 +74,14 @@ def build_Tok2Vec_model(
|
|||
embed: Model[List[Doc], List[Floats2d]],
|
||||
encode: Model[List[Floats2d], List[Floats2d]],
|
||||
) -> Model[List[Doc], List[Floats2d]]:
|
||||
"""Construct a tok2vec model out of embedding and encoding subnetworks.
|
||||
See https://explosion.ai/blog/deep-learning-formula-nlp
|
||||
|
||||
embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-independent
|
||||
word vector representations.
|
||||
encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the
|
||||
embeddings, using an architecture such as a CNN, BiLSTM or transformer.
|
||||
"""
|
||||
receptive_field = encode.attrs.get("receptive_field", 0)
|
||||
tok2vec = chain(embed, with_array(encode, pad=receptive_field))
|
||||
tok2vec.set_dim("nO", encode.get_dim("nO"))
|
||||
|
@ -67,6 +94,27 @@ def build_Tok2Vec_model(
|
|||
def MultiHashEmbed(
|
||||
width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
|
||||
):
|
||||
"""Construct an embedding layer that separately embeds a number of lexical
|
||||
attributes using hash embedding, concatenates the results, and passes it
|
||||
through a feed-forward subnetwork to build a mixed representation.
|
||||
|
||||
The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have
|
||||
varying definitions depending on the Vocab of the Doc object passed in.
|
||||
Vectors from pretrained static vectors can also be incorporated into the
|
||||
concatenated representation.
|
||||
|
||||
width (int): The output width. Also used as the width of the embedding tables.
|
||||
Recommended values are between 64 and 300.
|
||||
rows (int): The number of rows for the embedding tables. Can be low, due
|
||||
to the hashing trick. Embeddings for prefix, suffix and word shape
|
||||
use half as many rows. Recommended values are between 2000 and 10000.
|
||||
also_embed_subwords (bool): Whether to use the PREFIX, SUFFIX and SHAPE
|
||||
features in the embeddings. If not using these, you may need more
|
||||
rows in your hash embeddings, as there will be increased chance of
|
||||
collisions.
|
||||
also_use_static_vectors (bool): Whether to also use static word vectors.
|
||||
Requires a vectors table to be loaded in the Doc objects' vocab.
|
||||
"""
|
||||
cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||
seed = 7
|
||||
|
||||
|
@ -117,6 +165,30 @@ def MultiHashEmbed(
|
|||
|
||||
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
||||
def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
|
||||
"""Construct an embedded representations based on character embeddings, using
|
||||
a feed-forward network. A fixed number of UTF-8 byte characters are used for
|
||||
each word, taken from the beginning and end of the word equally. Padding is
|
||||
used in the centre for words that are too short.
|
||||
|
||||
For instance, let's say nC=4, and the word is "jumping". The characters
|
||||
used will be jung (two from the start, two from the end). If we had nC=8,
|
||||
the characters would be "jumpping": 4 from the start, 4 from the end. This
|
||||
ensures that the final character is always in the last position, instead
|
||||
of being in an arbitrary position depending on the word length.
|
||||
|
||||
The characters are embedded in an embedding table with 256 rows, and the
|
||||
vectors concatenated. A hash-embedded vector of the NORM of the word is
|
||||
also concatenated on, and the result is then passed through a feed-forward
|
||||
network to construct a single vector to represent the information.
|
||||
|
||||
width (int): The width of the output vector and the NORM hash embedding.
|
||||
rows (int): The number of rows in the NORM hash embedding table.
|
||||
nM (int): The dimensionality of the character embeddings. Recommended values
|
||||
are between 16 and 64.
|
||||
nC (int): The number of UTF-8 bytes to embed per word. Recommended values
|
||||
are between 3 and 8, although it may depend on the length of words in the
|
||||
language.
|
||||
"""
|
||||
model = chain(
|
||||
concatenate(
|
||||
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
||||
|
@ -133,7 +205,21 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
|
|||
|
||||
|
||||
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
|
||||
def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: int):
|
||||
def MaxoutWindowEncoder(
|
||||
width: int, window_size: int, maxout_pieces: int, depth: int
|
||||
) -> Model[List[Floats2d], List[Floats2d]]:
|
||||
"""Encode context using convolutions with maxout activation, layer
|
||||
normalization and residual connections.
|
||||
|
||||
width (int): The input and output width. These are required to be the same,
|
||||
to allow residual connections. This value will be determined by the
|
||||
width of the inputs. Recommended values are between 64 and 300.
|
||||
window_size (int): The number of words to concatenate around each token
|
||||
to construct the convolution. Recommended value is 1.
|
||||
maxout_pieces (int): The number of maxout pieces to use. Recommended
|
||||
values are 2 or 3.
|
||||
depth (int): The number of convolutional layers. Recommended value is 4.
|
||||
"""
|
||||
cnn = chain(
|
||||
expand_window(window_size=window_size),
|
||||
Maxout(
|
||||
|
@ -151,7 +237,19 @@ def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth:
|
|||
|
||||
|
||||
@registry.architectures.register("spacy.MishWindowEncoder.v1")
|
||||
def MishWindowEncoder(width, window_size, depth):
|
||||
def MishWindowEncoder(
|
||||
width: int, window_size: int, depth: int
|
||||
) -> Model[List[Floats2d], List[Floats2d]]:
|
||||
"""Encode context using convolutions with mish activation, layer
|
||||
normalization and residual connections.
|
||||
|
||||
width (int): The input and output width. These are required to be the same,
|
||||
to allow residual connections. This value will be determined by the
|
||||
width of the inputs. Recommended values are between 64 and 300.
|
||||
window_size (int): The number of words to concatenate around each token
|
||||
to construct the convolution. Recommended value is 1.
|
||||
depth (int): The number of convolutional layers. Recommended value is 4.
|
||||
"""
|
||||
cnn = chain(
|
||||
expand_window(window_size=window_size),
|
||||
Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
|
||||
|
@ -162,7 +260,18 @@ def MishWindowEncoder(width, window_size, depth):
|
|||
|
||||
|
||||
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
|
||||
def BiLSTMEncoder(width, depth, dropout):
|
||||
def BiLSTMEncoder(
|
||||
width: int, depth: int, dropout: float
|
||||
) -> Model[List[Floats2d], List[Floats2d]]:
|
||||
"""Encode context using bidirectonal LSTM layers. Requires PyTorch.
|
||||
|
||||
width (int): The input and output width. These are required to be the same,
|
||||
to allow residual connections. This value will be determined by the
|
||||
width of the inputs. Recommended values are between 64 and 300.
|
||||
depth (int): The number of BiLSTM layers to stack.
dropout (float): The dropout rate applied between the LSTM layers.
|
||||
"""
|
||||
if depth == 0:
|
||||
return noop()
|
||||
return with_padded(PyTorchLSTM(width, width, bi=True, depth=depth, dropout=dropout))
|
||||
|
|
|
@ -27,12 +27,6 @@ cdef class Morphology:
|
|||
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
|
||||
cdef int insert(self, MorphAnalysisC tag) except -1
|
||||
|
||||
cdef int assign_untagged(self, TokenC* token) except -1
|
||||
cdef int assign_tag(self, TokenC* token, tag) except -1
|
||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
|
||||
|
||||
cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1
|
||||
|
||||
|
||||
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
|
||||
cdef list list_features(const MorphAnalysisC* morph)
|
||||
|
|
|
@ -31,43 +31,15 @@ cdef class Morphology:
|
|||
VALUE_SEP = ","
|
||||
EMPTY_MORPH = "_" # not an empty string so that the PreshMap key is not 0
|
||||
|
||||
def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None):
|
||||
def __init__(self, StringStore strings):
|
||||
self.mem = Pool()
|
||||
self.strings = strings
|
||||
self.tags = PreshMap()
|
||||
self.load_tag_map(tag_map)
|
||||
self.lemmatizer = lemmatizer
|
||||
|
||||
self._cache = PreshMapArray(self.n_tags)
|
||||
self._exc = {}
|
||||
if exc is not None:
|
||||
self.load_morph_exceptions(exc)
|
||||
|
||||
def load_tag_map(self, tag_map):
|
||||
self.tag_map = {}
|
||||
self.reverse_index = {}
|
||||
# Add special space symbol. We prefix with underscore, to make sure it
|
||||
# always sorts to the end.
|
||||
if '_SP' in tag_map:
|
||||
space_attrs = tag_map.get('_SP')
|
||||
else:
|
||||
space_attrs = tag_map.get('SP', {POS: SPACE})
|
||||
if '_SP' not in tag_map:
|
||||
self.strings.add('_SP')
|
||||
tag_map = dict(tag_map)
|
||||
tag_map['_SP'] = space_attrs
|
||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||
attrs = self.normalize_attrs(attrs)
|
||||
self.add(attrs)
|
||||
self.tag_map[tag_str] = dict(attrs)
|
||||
self.reverse_index[self.strings.add(tag_str)] = i
|
||||
self.tag_names = tuple(sorted(self.tag_map.keys()))
|
||||
self.n_tags = len(self.tag_map)
|
||||
self._cache = PreshMapArray(self.n_tags)
|
||||
|
||||
def __reduce__(self):
|
||||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
||||
self.exc), None, None)
|
||||
tags = set([self.get(self.strings[s]) for s in self.strings])
|
||||
tags -= set([""])
|
||||
return (unpickle_morphology, (self.strings, sorted(tags)), None, None)
|
||||
|
||||
def add(self, features):
|
||||
"""Insert a morphological analysis in the morphology table, if not
|
||||
|
@ -185,115 +157,6 @@ cdef class Morphology:
|
|||
else:
|
||||
return self.strings[tag.key]
|
||||
|
||||
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
||||
if orth not in self.strings:
|
||||
return orth
|
||||
cdef unicode py_string = self.strings[orth]
|
||||
if self.lemmatizer is None:
|
||||
return self.strings.add(py_string.lower())
|
||||
cdef list lemma_strings
|
||||
cdef unicode lemma_string
|
||||
# Normalize features into a dict keyed by the field, to make life easier
|
||||
# for the lemmatizer. Handles string-to-int conversion too.
|
||||
string_feats = {}
|
||||
for key, value in morphology.items():
|
||||
if value is True:
|
||||
name, value = self.strings.as_string(key).split('_', 1)
|
||||
string_feats[name] = value
|
||||
else:
|
||||
string_feats[self.strings.as_string(key)] = self.strings.as_string(value)
|
||||
lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats)
|
||||
lemma_string = lemma_strings[0]
|
||||
lemma = self.strings.add(lemma_string)
|
||||
return lemma
|
||||
|
||||
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
|
||||
force=False):
|
||||
"""Add a special-case rule to the morphological analyser. Tokens whose
|
||||
tag and orth match the rule will receive the specified properties.
|
||||
|
||||
tag (str): The part-of-speech tag to key the exception.
|
||||
orth (str): The word-form to key the exception.
|
||||
"""
|
||||
attrs = dict(attrs)
|
||||
attrs = self.normalize_attrs(attrs)
|
||||
self.add(attrs)
|
||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||
self._exc[(tag_str, self.strings.add(orth_str))] = attrs
|
||||
|
||||
cdef int assign_untagged(self, TokenC* token) except -1:
|
||||
"""Set morphological attributes on a token without a POS tag. Uses
|
||||
the lemmatizer's lookup() method, which looks up the string in the
|
||||
table provided by the language data as lemma_lookup (if available).
|
||||
"""
|
||||
if token.lemma == 0:
|
||||
orth_str = self.strings[token.lex.orth]
|
||||
lemma = self.lemmatizer.lookup(orth_str, orth=token.lex.orth)
|
||||
token.lemma = self.strings.add(lemma)
|
||||
|
||||
cdef int assign_tag(self, TokenC* token, tag_str) except -1:
|
||||
cdef attr_t tag = self.strings.as_int(tag_str)
|
||||
if tag in self.reverse_index:
|
||||
tag_id = self.reverse_index[tag]
|
||||
self.assign_tag_id(token, tag_id)
|
||||
else:
|
||||
token.tag = tag
|
||||
|
||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||
if tag_id > self.n_tags:
|
||||
raise ValueError(Errors.E014.format(tag=tag_id))
|
||||
# Ensure spaces get tagged as space.
|
||||
# It seems pretty arbitrary to put this logic here, but there's really
|
||||
# nowhere better. I guess the justification is that this is where the
|
||||
# specific word and the tag interact. Still, we should have a better
|
||||
# way to enforce this rule, or figure out why the statistical model fails.
|
||||
# Related to Issue #220
|
||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||
tag_id = self.reverse_index[self.strings.add('_SP')]
|
||||
tag_str = self.tag_names[tag_id]
|
||||
features = dict(self.tag_map.get(tag_str, {}))
|
||||
if features:
|
||||
pos = self.strings.as_int(features.pop(POS))
|
||||
else:
|
||||
pos = 0
|
||||
cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
|
||||
if lemma == 0:
|
||||
# Ugh, self.lemmatize has opposite arg order from self.lemmatizer :(
|
||||
lemma = self.lemmatize(pos, token.lex.orth, features)
|
||||
self._cache.set(tag_id, token.lex.orth, <void*>lemma)
|
||||
token.lemma = lemma
|
||||
token.pos = <univ_pos_t>pos
|
||||
token.tag = self.strings[tag_str]
|
||||
token.morph = self.add(features)
|
||||
if (self.tag_names[tag_id], token.lex.orth) in self._exc:
|
||||
self._assign_tag_from_exceptions(token, tag_id)
|
||||
|
||||
cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
|
||||
key = (self.tag_names[tag_id], token.lex.orth)
|
||||
cdef dict attrs
|
||||
attrs = self._exc[key]
|
||||
token.pos = attrs.get(POS, token.pos)
|
||||
token.lemma = attrs.get(LEMMA, token.lemma)
|
||||
|
||||
def load_morph_exceptions(self, dict morph_rules):
|
||||
self._exc = {}
|
||||
# Map (form, pos) to attributes
|
||||
for tag, exc in morph_rules.items():
|
||||
for orth, attrs in exc.items():
|
||||
attrs = self.normalize_attrs(attrs)
|
||||
self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs)
|
||||
|
||||
@property
|
||||
def exc(self):
|
||||
# generate the serializable exc in the MORPH_RULES format from the
|
||||
# internal tuple-key format
|
||||
morph_rules = {}
|
||||
for (tag, orth) in sorted(self._exc):
|
||||
if tag not in morph_rules:
|
||||
morph_rules[tag] = {}
|
||||
morph_rules[tag][self.strings[orth]] = self._exc[(tag, orth)]
|
||||
return morph_rules
|
||||
|
||||
@staticmethod
|
||||
def feats_to_dict(feats):
|
||||
if not feats or feats == Morphology.EMPTY_MORPH:
|
||||
|
@ -338,3 +201,9 @@ cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t fie
|
|||
results[n_results] = morph.features[i]
|
||||
n_results += 1
|
||||
return n_results
|
||||
|
||||
def unpickle_morphology(strings, tags):
|
||||
cdef Morphology morphology = Morphology(strings)
|
||||
for tag in tags:
|
||||
morphology.add(tag)
|
||||
return morphology
|
||||
|
|
|
@ -3,9 +3,10 @@ from .dep_parser import DependencyParser
|
|||
from .entity_linker import EntityLinker
|
||||
from .ner import EntityRecognizer
|
||||
from .entityruler import EntityRuler
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .morphologizer import Morphologizer
|
||||
from .pipe import Pipe
|
||||
from spacy.pipeline.senter import SentenceRecognizer
|
||||
from .senter import SentenceRecognizer
|
||||
from .sentencizer import Sentencizer
|
||||
from .simple_ner import SimpleNER
|
||||
from .tagger import Tagger
|
||||
|
@ -20,6 +21,7 @@ __all__ = [
|
|||
"EntityRecognizer",
|
||||
"EntityRuler",
|
||||
"Morphologizer",
|
||||
"Lemmatizer",
|
||||
"Pipe",
|
||||
"SentenceRecognizer",
|
||||
"Sentencizer",
|
||||
|
|
|
@ -17,13 +17,18 @@ MatcherPatternType = List[Dict[Union[int, str], Any]]
|
|||
AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]
|
||||
|
||||
|
||||
@Language.factory("attribute_ruler")
|
||||
@Language.factory(
|
||||
"attribute_ruler", default_config={"pattern_dicts": None, "validate": False}
|
||||
)
|
||||
def make_attribute_ruler(
|
||||
nlp: Language,
|
||||
name: str,
|
||||
pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None,
|
||||
pattern_dicts: Optional[Iterable[AttributeRulerPatternType]],
|
||||
validate: bool,
|
||||
):
|
||||
return AttributeRuler(nlp.vocab, name, pattern_dicts=pattern_dicts)
|
||||
return AttributeRuler(
|
||||
nlp.vocab, name, pattern_dicts=pattern_dicts, validate=validate
|
||||
)
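# Editor's sketch of using the factory above from the nlp object. The pattern
# dict format follows AttributeRulerPatternType / add_patterns as documented
# further down; the concrete pattern and attrs are only an example.
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("attribute_ruler", config={"validate": True})
ruler.add_patterns([{"patterns": [[{"ORTH": "who"}]], "attrs": {"LEMMA": "who"}}])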
|
||||
|
||||
|
||||
class AttributeRuler(Pipe):
|
||||
|
@ -39,6 +44,7 @@ class AttributeRuler(Pipe):
|
|||
name: str = "attribute_ruler",
|
||||
*,
|
||||
pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None,
|
||||
validate: bool = False,
|
||||
) -> None:
|
||||
"""Initialize the AttributeRuler.
|
||||
|
||||
|
@ -54,7 +60,7 @@ class AttributeRuler(Pipe):
|
|||
"""
|
||||
self.name = name
|
||||
self.vocab = vocab
|
||||
self.matcher = Matcher(self.vocab)
|
||||
self.matcher = Matcher(self.vocab, validate=validate)
|
||||
self.attrs = []
|
||||
self._attrs_unnormed = [] # store for reference
|
||||
self.indices = []
|
||||
|
@ -63,7 +69,7 @@ class AttributeRuler(Pipe):
|
|||
self.add_patterns(pattern_dicts)
|
||||
|
||||
def __call__(self, doc: Doc) -> Doc:
|
||||
"""Apply the attributeruler to a Doc and set all attribute exceptions.
|
||||
"""Apply the AttributeRuler to a Doc and set all attribute exceptions.
|
||||
|
||||
doc (Doc): The document to process.
|
||||
RETURNS (Doc): The processed Doc.
|
||||
|
@ -89,9 +95,31 @@ class AttributeRuler(Pipe):
|
|||
set_token_attrs(token, attrs)
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, *, batch_size=128):
|
||||
"""Apply the pipe to a stream of documents. This usually happens under
|
||||
the hood when the nlp object is called on a text and all components are
|
||||
applied to the Doc.
|
||||
|
||||
stream (Iterable[Doc]): A stream of documents.
|
||||
batch_size (int): The number of documents to buffer.
|
||||
YIELDS (Doc): Processed documents in order.
|
||||
|
||||
DOCS: https://spacy.io/api/attributeruler#pipe
|
||||
"""
|
||||
for doc in stream:
|
||||
doc = self(doc)
|
||||
yield doc
|
||||
|
||||
def load_from_tag_map(
|
||||
self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]]
|
||||
) -> None:
|
||||
"""Load attribute ruler patterns from a tag map.
|
||||
|
||||
tag_map (dict): The tag map that maps fine-grained tags to
|
||||
coarse-grained tags and morphological features.
|
||||
|
||||
DOCS: https://spacy.io/api/attributeruler#load_from_tag_map
|
||||
"""
|
||||
for tag, attrs in tag_map.items():
|
||||
pattern = [{"TAG": tag}]
|
||||
attrs, morph_attrs = _split_morph_attrs(attrs)
|
||||
|
@ -102,6 +130,14 @@ class AttributeRuler(Pipe):
|
|||
def load_from_morph_rules(
|
||||
self, morph_rules: Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
|
||||
) -> None:
|
||||
"""Load attribute ruler patterns from morph rules.
|
||||
|
||||
morph_rules (dict): The morph rules that map token text and
|
||||
fine-grained tags to coarse-grained tags, lemmas and morphological
|
||||
features.
|
||||
|
||||
DOCS: https://spacy.io/api/attributeruler#load_from_morph_rules
|
||||
"""
|
||||
for tag in morph_rules:
|
||||
for word in morph_rules[tag]:
|
||||
pattern = [{"ORTH": word, "TAG": tag}]
|
||||
|
@ -133,11 +169,20 @@ class AttributeRuler(Pipe):
|
|||
self.indices.append(index)
|
||||
|
||||
def add_patterns(self, pattern_dicts: Iterable[AttributeRulerPatternType]) -> None:
|
||||
"""Add patterns from a list of pattern dicts with the keys as the
|
||||
arguments to AttributeRuler.add.
|
||||
pattern_dicts (Iterable[dict]): A list of pattern dicts with the keys
|
||||
as the arguments to AttributeRuler.add (patterns/attrs/index) to
|
||||
add as patterns.
|
||||
|
||||
DOCS: https://spacy.io/api/attributeruler#add_patterns
|
||||
"""
|
||||
for p in pattern_dicts:
|
||||
self.add(**p)
|
||||
|
||||
@property
|
||||
def patterns(self) -> List[AttributeRulerPatternType]:
|
||||
"""All the added patterns."""
|
||||
all_patterns = []
|
||||
for i in range(len(self.attrs)):
|
||||
p = {}
|
||||
|
@ -148,7 +193,7 @@ class AttributeRuler(Pipe):
|
|||
return all_patterns
|
||||
|
||||
def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes:
|
||||
"""Serialize the attributeruler to a bytestring.
|
||||
"""Serialize the AttributeRuler to a bytestring.
|
||||
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized object.
|
||||
|
@ -164,7 +209,7 @@ class AttributeRuler(Pipe):
|
|||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()):
|
||||
"""Load the attributeruler from a bytestring.
|
||||
"""Load the AttributeRuler from a bytestring.
|
||||
|
||||
bytes_data (bytes): The data to load.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
@ -200,7 +245,7 @@ class AttributeRuler(Pipe):
|
|||
return self
|
||||
|
||||
def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()) -> None:
|
||||
"""Serialize the attributeruler to disk.
|
||||
"""Serialize the AttributeRuler to disk.
|
||||
|
||||
path (Union[Path, str]): A path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
@ -218,7 +263,7 @@ class AttributeRuler(Pipe):
|
|||
def from_disk(
|
||||
self, path: Union[Path, str], exclude: Iterable[str] = tuple()
|
||||
) -> None:
|
||||
"""Load the attributeruler from disk.
|
||||
"""Load the AttributeRuler from disk.
|
||||
|
||||
path (Union[Path, str]): A path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
|
|
@ -27,7 +27,6 @@ embed_size = 2000
|
|||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
||||
"""
|
||||
DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
|
||||
|
||||
|
|
|
@ -29,7 +29,6 @@ embed_size = 300
|
|||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
||||
"""
|
||||
DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
|||
assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
|
||||
default_config={
|
||||
"phrase_matcher_attr": None,
|
||||
"validation": False,
|
||||
"validate": False,
|
||||
"overwrite_ents": False,
|
||||
"ent_id_sep": DEFAULT_ENT_ID_SEP,
|
||||
},
|
||||
|
@ -31,7 +31,7 @@ def make_entity_ruler(
|
|||
nlp: Language,
|
||||
name: str,
|
||||
phrase_matcher_attr: Optional[Union[int, str]],
|
||||
validation: bool,
|
||||
validate: bool,
|
||||
overwrite_ents: bool,
|
||||
ent_id_sep: str,
|
||||
):
|
||||
|
@ -39,7 +39,7 @@ def make_entity_ruler(
|
|||
nlp,
|
||||
name,
|
||||
phrase_matcher_attr=phrase_matcher_attr,
|
||||
validate=validation,
|
||||
validate=validate,
|
||||
overwrite_ents=overwrite_ents,
|
||||
ent_id_sep=ent_id_sep,
|
||||
)
|
||||
|
|
330
spacy/pipeline/lemmatizer.py
Normal file
|
@ -0,0 +1,330 @@
|
|||
from typing import Optional, List, Dict, Any
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from .pipe import Pipe
|
||||
from ..errors import Errors
|
||||
from ..language import Language
|
||||
from ..lookups import Lookups, load_lookups
|
||||
from ..scorer import Scorer
|
||||
from ..tokens import Doc, Token
|
||||
from ..vocab import Vocab
|
||||
from .. import util
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "lookup",
|
||||
"lookups": None,
|
||||
"overwrite": False,
|
||||
},
|
||||
scores=["lemma_acc"],
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
overwrite: bool = False,
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
|
||||
)
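# Editor's sketch of wiring the component above into a pipeline. Assumes the
# lookup tables for the language are available via spacy-lookups-data; the
# config keys mirror the default_config declared in the factory.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("lemmatizer", config={"mode": "lookup", "overwrite": False})
doc = nlp("These were dogs")
print([t.lemma_ for t in doc])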
|
||||
|
||||
|
||||
class Lemmatizer(Pipe):
|
||||
"""
|
||||
The Lemmatizer supports simple part-of-speech-sensitive suffix rules and
|
||||
lookup tables.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def get_lookups_config(cls, mode: str) -> Dict:
|
||||
"""Returns the lookups configuration settings for a given mode for use
|
||||
in Lemmatizer.load_lookups.
|
||||
|
||||
mode (str): The lemmatizer mode.
|
||||
RETURNS (dict): The lookups configuration settings for this mode.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#get_lookups_config
|
||||
"""
|
||||
if mode == "lookup":
|
||||
return {
|
||||
"required_tables": ["lemma_lookup"],
|
||||
}
|
||||
elif mode == "rule":
|
||||
return {
|
||||
"required_tables": ["lemma_rules"],
|
||||
"optional_tables": ["lemma_exc", "lemma_index"],
|
||||
}
|
||||
return {}
|
||||
|
||||
@classmethod
|
||||
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
|
||||
"""Load and validate lookups tables. If the provided lookups is None,
|
||||
load the default lookups tables according to the language and mode
|
||||
settings. Confirm that all required tables for the language and mode
|
||||
are present.
|
||||
|
||||
lang (str): The language code.
|
||||
mode (str): The lemmatizer mode.
|
||||
lookups (Lookups): The provided lookups, may be None if the default
|
||||
lookups should be loaded.
|
||||
RETURNS (Lookups): The Lookups object.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#load_lookups
|
||||
"""
|
||||
config = cls.get_lookups_config(mode)
|
||||
required_tables = config.get("required_tables", [])
|
||||
optional_tables = config.get("optional_tables", [])
|
||||
if lookups is None:
|
||||
lookups = load_lookups(lang=lang, tables=required_tables)
|
||||
optional_lookups = load_lookups(
|
||||
lang=lang, tables=optional_tables, strict=False
|
||||
)
|
||||
for table in optional_lookups.tables:
|
||||
lookups.set_table(table, optional_lookups.get_table(table))
|
||||
for table in required_tables:
|
||||
if table not in lookups:
|
||||
raise ValueError(
|
||||
Errors.E1004.format(
|
||||
mode=mode, tables=required_tables, found=lookups.tables
|
||||
)
|
||||
)
|
||||
return lookups
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Vocab,
|
||||
model: Optional[Model],
|
||||
name: str = "lemmatizer",
|
||||
*,
|
||||
mode: str = "lookup",
|
||||
lookups: Optional[Lookups] = None,
|
||||
overwrite: bool = False,
|
||||
) -> None:
|
||||
"""Initialize a Lemmatizer.
|
||||
|
||||
vocab (Vocab): The vocab.
|
||||
model (Model): A model (not yet implemented).
|
||||
name (str): The component name. Defaults to "lemmatizer".
|
||||
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
|
||||
lookups (Lookups): The lookups object containing the (optional) tables
|
||||
such as "lemma_rules", "lemma_index", "lemma_exc" and
|
||||
"lemma_lookup". Defaults to None
|
||||
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
|
||||
`False`.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self._mode = mode
|
||||
self.lookups = lookups if lookups is not None else Lookups()
|
||||
self.overwrite = overwrite
|
||||
if self.mode == "lookup":
|
||||
self.lemmatize = self.lookup_lemmatize
|
||||
elif self.mode == "rule":
|
||||
self.lemmatize = self.rule_lemmatize
|
||||
else:
|
||||
try:
|
||||
self.lemmatize = getattr(self, f"{self.mode}_lemmatize")
|
||||
except AttributeError:
|
||||
raise ValueError(Errors.E1003.format(mode=mode))
|
||||
self.cache = {}
|
||||
|
||||
@property
|
||||
def mode(self):
|
||||
return self._mode
|
||||
|
||||
def __call__(self, doc: Doc) -> Doc:
|
||||
"""Apply the lemmatizer to one document.
|
||||
|
||||
doc (Doc): The Doc to process.
|
||||
RETURNS (Doc): The processed Doc.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#call
|
||||
"""
|
||||
for token in doc:
|
||||
if self.overwrite or token.lemma == 0:
|
||||
token.lemma_ = self.lemmatize(token)[0]
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, *, batch_size=128):
|
||||
"""Apply the pipe to a stream of documents. This usually happens under
|
||||
the hood when the nlp object is called on a text and all components are
|
||||
applied to the Doc.
|
||||
|
||||
stream (Iterable[Doc]): A stream of documents.
|
||||
batch_size (int): The number of documents to buffer.
|
||||
YIELDS (Doc): Processed documents in order.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#pipe
|
||||
"""
|
||||
for doc in stream:
|
||||
doc = self(doc)
|
||||
yield doc
|
||||
|
||||
def lookup_lemmatize(self, token: Token) -> List[str]:
|
||||
"""Lemmatize using a lookup-based approach.
|
||||
|
||||
token (Token): The token to lemmatize.
|
||||
RETURNS (list): The available lemmas for the string.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#lookup_lemmatize
|
||||
"""
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
result = lookup_table.get(token.text, token.text)
|
||||
if isinstance(result, str):
|
||||
result = [result]
|
||||
return result
|
||||
|
||||
def rule_lemmatize(self, token: Token) -> List[str]:
|
||||
"""Lemmatize using a rule-based approach.
|
||||
|
||||
token (Token): The token to lemmatize.
|
||||
RETURNS (list): The available lemmas for the string.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#rule_lemmatize
|
||||
"""
|
||||
cache_key = (token.orth, token.pos, token.morph)
|
||||
if cache_key in self.cache:
|
||||
return self.cache[cache_key]
|
||||
string = token.text
|
||||
univ_pos = token.pos_.lower()
|
||||
if univ_pos in ("", "eol", "space"):
|
||||
return [string.lower()]
|
||||
# See Issue #435 for an example of where this logic is required.
|
||||
if self.is_base_form(token):
|
||||
return [string.lower()]
|
||||
index_table = self.lookups.get_table("lemma_index", {})
|
||||
exc_table = self.lookups.get_table("lemma_exc", {})
|
||||
rules_table = self.lookups.get_table("lemma_rules", {})
|
||||
if not any(
|
||||
(
|
||||
index_table.get(univ_pos),
|
||||
exc_table.get(univ_pos),
|
||||
rules_table.get(univ_pos),
|
||||
)
|
||||
):
|
||||
if univ_pos == "propn":
|
||||
return [string]
|
||||
else:
|
||||
return [string.lower()]
|
||||
|
||||
index = index_table.get(univ_pos, {})
|
||||
exceptions = exc_table.get(univ_pos, {})
|
||||
rules = rules_table.get(univ_pos, {})
|
||||
orig = string
|
||||
string = string.lower()
|
||||
forms = []
|
||||
oov_forms = []
|
||||
for old, new in rules:
|
||||
if string.endswith(old):
|
||||
form = string[: len(string) - len(old)] + new
|
||||
if not form:
|
||||
pass
|
||||
elif form in index or not form.isalpha():
|
||||
forms.append(form)
|
||||
else:
|
||||
oov_forms.append(form)
|
||||
# Remove duplicates but preserve the ordering of applied "rules"
|
||||
forms = list(dict.fromkeys(forms))
|
||||
# Put exceptions at the front of the list, so they get priority.
|
||||
# This is a dodgy heuristic -- but it's the best we can do until we get
|
||||
# frequencies on this. We can at least prune out problematic exceptions,
|
||||
# if they shadow more frequent analyses.
|
||||
for form in exceptions.get(string, []):
|
||||
if form not in forms:
|
||||
forms.insert(0, form)
|
||||
if not forms:
|
||||
forms.extend(oov_forms)
|
||||
if not forms:
|
||||
forms.append(orig)
|
||||
self.cache[cache_key] = forms
|
||||
return forms
|
||||
|
||||
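    # A worked, self-contained sketch of the suffix-rule logic above, using the
    # "cope" tables from the test fixtures later in this diff (rule ["ing", ""],
    # index ("cope", "cop"), exception "coping" -> ("cope",)). It is a
    # simplification: the cache, the POS dispatch, the deduplication step and
    # the is_base_form() shortcut are skipped.
    #
    #     index = {"cope", "cop"}
    #     exceptions = {"coping": ("cope",)}
    #     rules = [["ing", ""]]
    #
    #     def rule_lemmatize(string):
    #         string = string.lower()
    #         forms, oov_forms = [], []
    #         for old, new in rules:
    #             if string.endswith(old):
    #                 form = string[: len(string) - len(old)] + new
    #                 if form and (form in index or not form.isalpha()):
    #                     forms.append(form)
    #                 elif form:
    #                     oov_forms.append(form)
    #         for form in exceptions.get(string, []):
    #             if form not in forms:
    #                 forms.insert(0, form)
    #         return forms or oov_forms or [string]
    #
    #     rule_lemmatize("coping")  # ["cope", "cop"]; the exception comes first,
    #                               # so __call__ assigns token.lemma_ = "cope"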
    def is_base_form(self, token: Token) -> bool:
        """Check whether the token is a base form that does not need further
        analysis for lemmatization.

        token (Token): The token.
        RETURNS (bool): Whether the token is a base form.

        DOCS: https://spacy.io/api/lemmatizer#is_base_form
        """
        return False

    def score(self, examples, **kwargs) -> Dict[str, Any]:
        """Score a batch of examples.

        examples (Iterable[Example]): The examples to score.
        RETURNS (Dict[str, Any]): The scores.

        DOCS: https://spacy.io/api/lemmatizer#score
        """
        return Scorer.score_token_attr(examples, "lemma", **kwargs)

    def to_disk(self, path, *, exclude=tuple()):
        """Save the current state to a directory.

        path (unicode or Path): A path to a directory, which will be created if
            it doesn't exist.
        exclude (list): String names of serialization fields to exclude.

        DOCS: https://spacy.io/api/lemmatizer#to_disk
        """
        serialize = {}
        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
        serialize["lookups"] = lambda p: self.lookups.to_disk(p)
        util.to_disk(path, serialize, exclude)

    def from_disk(self, path, *, exclude=tuple()):
        """Loads state from a directory. Modifies the object in place and
        returns it.

        path (unicode or Path): A path to a directory.
        exclude (list): String names of serialization fields to exclude.
        RETURNS (Lemmatizer): The modified `Lemmatizer` object.

        DOCS: https://spacy.io/api/lemmatizer#from_disk
        """
        deserialize = {}
        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
        deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
        util.from_disk(path, deserialize, exclude)

    def to_bytes(self, *, exclude=tuple()) -> bytes:
        """Serialize the current state to a binary string.

        exclude (list): String names of serialization fields to exclude.
        RETURNS (bytes): The serialized form of the `Lemmatizer` object.

        DOCS: https://spacy.io/api/lemmatizer#to_bytes
        """
        serialize = {}
        serialize["vocab"] = self.vocab.to_bytes
        serialize["lookups"] = self.lookups.to_bytes
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
        """Load state from a binary string.

        bytes_data (bytes): The data to load from.
        exclude (list): String names of serialization fields to exclude.
        RETURNS (Lemmatizer): The `Lemmatizer` object.

        DOCS: https://spacy.io/api/lemmatizer#from_bytes
        """
        deserialize = {}
        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
        deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
        util.from_bytes(bytes_data, deserialize, exclude)
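    # The hooks above serialize only the shared vocab and the lookups tables. A
    # roundtrip sketch, following test_lemmatizer_serialize later in this diff
    # (assumes two pipelines whose "lemmatizer" components share the same config):
    #
    #     lemmatizer2.from_bytes(lemmatizer.to_bytes())
    #     assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
    #     assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables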
|
|
@ -29,7 +29,6 @@ embed_size = 2000
|
|||
window_size = 1
|
||||
maxout_pieces = 2
|
||||
subword_features = true
|
||||
dropout = null
|
||||
"""
|
||||
DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]
|
||||
|
||||
|
|
|
@ -25,7 +25,6 @@ embed_size = 2000
|
|||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
||||
"""
|
||||
DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
|
||||
|
||||
|
|
|
@ -25,7 +25,6 @@ embed_size = 2000
|
|||
window_size = 1
|
||||
maxout_pieces = 2
|
||||
subword_features = true
|
||||
dropout = null
|
||||
"""
|
||||
DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
|
||||
|
||||
|
|
|
@ -15,7 +15,7 @@ from .pipe import Pipe
|
|||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.BiluoTagger.v1"
|
||||
@architectures = "spacy.BILUOTagger.v1"
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
|
@ -26,7 +26,6 @@ embed_size = 7000
|
|||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
||||
"""
|
||||
DEFAULT_SIMPLE_NER_MODEL = Config().from_str(default_model_config)["model"]
|
||||
|
||||
|
|
|
@ -31,7 +31,6 @@ embed_size = 2000
|
|||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
||||
"""
|
||||
DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
|
||||
|
||||
|
@ -39,12 +38,12 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"tagger",
|
||||
assigns=["token.tag"],
|
||||
default_config={"model": DEFAULT_TAGGER_MODEL, "set_morphology": False},
|
||||
scores=["tag_acc", "pos_acc", "lemma_acc"],
|
||||
default_config={"model": DEFAULT_TAGGER_MODEL},
|
||||
scores=["tag_acc"],
|
||||
default_score_weights={"tag_acc": 1.0},
|
||||
)
|
||||
def make_tagger(nlp: Language, name: str, model: Model, set_morphology: bool):
|
||||
return Tagger(nlp.vocab, model, name, set_morphology=set_morphology)
|
||||
def make_tagger(nlp: Language, name: str, model: Model):
|
||||
return Tagger(nlp.vocab, model, name)
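# A minimal usage sketch of the updated factory: labels now have to be added to
# the component explicitly, since the tag map no longer supplies them. This
# mirrors test_issue2564 / test_issue3456 later in this diff:
#
#     from spacy.lang.en import English
#
#     nlp = English()
#     tagger = nlp.add_pipe("tagger")
#     tagger.add_label("A")
#     nlp.begin_training()
#     doc = nlp("hello world")  # doc.is_tagged is set by the component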
|
||||
|
||||
|
||||
class Tagger(Pipe):
|
||||
|
@ -52,13 +51,14 @@ class Tagger(Pipe):
|
|||
|
||||
DOCS: https://spacy.io/api/tagger
|
||||
"""
|
||||
def __init__(self, vocab, model, name="tagger", *, set_morphology=False):
|
||||
def __init__(self, vocab, model, name="tagger", *, labels=None):
|
||||
"""Initialize a part-of-speech tagger.
|
||||
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
labels (List): The set of labels. Defaults to None.
|
||||
set_morphology (bool): Whether to set morphological features.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#init
|
||||
|
@ -67,7 +67,7 @@ class Tagger(Pipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self._rehearsal_model = None
|
||||
cfg = {"set_morphology": set_morphology}
|
||||
cfg = {"labels": labels or []}
|
||||
self.cfg = dict(sorted(cfg.items()))
|
||||
|
||||
@property
|
||||
|
@ -80,7 +80,7 @@ class Tagger(Pipe):
|
|||
|
||||
DOCS: https://spacy.io/api/tagger#labels
|
||||
"""
|
||||
return tuple(self.vocab.morphology.tag_names)
|
||||
return tuple(self.cfg["labels"])
|
||||
|
||||
def __call__(self, doc):
|
||||
"""Apply the pipe to a Doc.
|
||||
|
@ -150,9 +150,7 @@ class Tagger(Pipe):
|
|||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
cdef Doc doc
|
||||
cdef int idx = 0
|
||||
cdef Vocab vocab = self.vocab
|
||||
assign_morphology = self.cfg.get("set_morphology", True)
|
||||
for i, doc in enumerate(docs):
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
if hasattr(doc_tag_ids, "get"):
|
||||
|
@ -160,15 +158,7 @@ class Tagger(Pipe):
|
|||
for j, tag_id in enumerate(doc_tag_ids):
|
||||
# Don't clobber preset POS tags
|
||||
if doc.c[j].tag == 0:
|
||||
if doc.c[j].pos == 0 and assign_morphology:
|
||||
# Don't clobber preset lemmas
|
||||
lemma = doc.c[j].lemma
|
||||
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
|
||||
if lemma != 0 and lemma != doc.c[j].lex.orth:
|
||||
doc.c[j].lemma = lemma
|
||||
else:
|
||||
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
|
||||
idx += 1
|
||||
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
|
||||
doc.is_tagged = True
|
||||
|
||||
def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
|
||||
|
@ -279,55 +269,26 @@ class Tagger(Pipe):
|
|||
|
||||
DOCS: https://spacy.io/api/tagger#begin_training
|
||||
"""
|
||||
lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
|
||||
if not any(table in self.vocab.lookups for table in lemma_tables):
|
||||
warnings.warn(Warnings.W022)
|
||||
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
|
||||
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
|
||||
langs = ", ".join(util.LEXEME_NORM_LANGS)
|
||||
warnings.warn(Warnings.W033.format(model="part-of-speech tagger", langs=langs))
|
||||
orig_tag_map = dict(self.vocab.morphology.tag_map)
|
||||
new_tag_map = {}
|
||||
tags = set()
|
||||
for example in get_examples():
|
||||
try:
|
||||
y = example.y
|
||||
except AttributeError:
|
||||
raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) from None
|
||||
for token in y:
|
||||
tag = token.tag_
|
||||
if tag in orig_tag_map:
|
||||
new_tag_map[tag] = orig_tag_map[tag]
|
||||
else:
|
||||
new_tag_map[tag] = {POS: X}
|
||||
|
||||
cdef Vocab vocab = self.vocab
|
||||
if new_tag_map:
|
||||
if "_SP" in orig_tag_map:
|
||||
new_tag_map["_SP"] = orig_tag_map["_SP"]
|
||||
vocab.morphology.load_tag_map(new_tag_map)
|
||||
tags.add(token.tag_)
|
||||
for tag in sorted(tags):
|
||||
self.add_label(tag)
|
||||
self.set_output(len(self.labels))
|
||||
doc_sample = [Doc(self.vocab, words=["hello", "world"])]
|
||||
if pipeline is not None:
|
||||
for name, component in pipeline:
|
||||
if component is self:
|
||||
break
|
||||
if hasattr(component, "pipe"):
|
||||
doc_sample = list(component.pipe(doc_sample))
|
||||
else:
|
||||
doc_sample = [component(doc) for doc in doc_sample]
|
||||
self.model.initialize(X=doc_sample)
|
||||
# Get batch of example docs, example outputs to call begin_training().
|
||||
# This lets the model infer shapes.
|
||||
self.model.initialize()
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
|
||||
def add_label(self, label, values=None):
|
||||
def add_label(self, label):
|
||||
"""Add a new label to the pipe.
|
||||
|
||||
label (str): The label to add.
|
||||
values (Dict[int, str]): Optional values to map to the label, e.g. a
|
||||
tag map dictionary.
|
||||
RETURNS (int): 0 if label is already present, otherwise 1.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#add_label
|
||||
|
@ -336,22 +297,8 @@ class Tagger(Pipe):
|
|||
raise ValueError(Errors.E187)
|
||||
if label in self.labels:
|
||||
return 0
|
||||
if self.model.has_dim("nO"):
|
||||
# Here's how the model resizing will work, once the
|
||||
# neuron-to-tag mapping is no longer controlled by
|
||||
# the Morphology class, which sorts the tag names.
|
||||
# The sorting makes adding labels difficult.
|
||||
# smaller = self.model._layers[-1]
|
||||
# larger = Softmax(len(self.labels)+1, smaller.nI)
|
||||
# copy_array(larger.W[:smaller.nO], smaller.W)
|
||||
# copy_array(larger.b[:smaller.nO], smaller.b)
|
||||
# self.model._layers[-1] = larger
|
||||
raise ValueError(TempErrors.T003)
|
||||
tag_map = dict(self.vocab.morphology.tag_map)
|
||||
if values is None:
|
||||
values = {POS: "X"}
|
||||
tag_map[label] = values
|
||||
self.vocab.morphology.load_tag_map(tag_map)
|
||||
self.cfg["labels"].append(label)
|
||||
self.vocab.strings.add(label)
|
||||
return 1
|
||||
|
||||
def score(self, examples, **kwargs):
|
||||
|
@ -363,11 +310,7 @@ class Tagger(Pipe):
|
|||
|
||||
DOCS: https://spacy.io/api/tagger#score
|
||||
"""
|
||||
scores = {}
|
||||
scores.update(Scorer.score_token_attr(examples, "tag", **kwargs))
|
||||
scores.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||
scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
|
||||
return scores
|
||||
return Scorer.score_token_attr(examples, "tag", **kwargs)
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()):
|
||||
"""Serialize the pipe to a bytestring.
|
||||
|
@ -381,10 +324,6 @@ class Tagger(Pipe):
|
|||
serialize["model"] = self.model.to_bytes
|
||||
serialize["vocab"] = self.vocab.to_bytes
|
||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||
tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
|
||||
serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
|
||||
morph_rules = dict(self.vocab.morphology.exc)
|
||||
serialize["morph_rules"] = lambda: srsly.msgpack_dumps(morph_rules)
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, *, exclude=tuple()):
|
||||
|
@ -402,21 +341,8 @@ class Tagger(Pipe):
|
|||
except AttributeError:
|
||||
raise ValueError(Errors.E149) from None
|
||||
|
||||
def load_tag_map(b):
|
||||
tag_map = srsly.msgpack_loads(b)
|
||||
self.vocab.morphology.load_tag_map(tag_map)
|
||||
|
||||
def load_morph_rules(b):
|
||||
morph_rules = srsly.msgpack_loads(b)
|
||||
self.vocab.morphology.load_morph_exceptions(morph_rules)
|
||||
|
||||
self.vocab.morphology = Morphology(self.vocab.strings, dict(),
|
||||
lemmatizer=self.vocab.morphology.lemmatizer)
|
||||
|
||||
deserialize = {
|
||||
"vocab": lambda b: self.vocab.from_bytes(b),
|
||||
"tag_map": load_tag_map,
|
||||
"morph_rules": load_morph_rules,
|
||||
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
||||
"model": lambda b: load_model(b),
|
||||
}
|
||||
|
@ -431,12 +357,8 @@ class Tagger(Pipe):
|
|||
|
||||
DOCS: https://spacy.io/api/tagger#to_disk
|
||||
"""
|
||||
tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
|
||||
morph_rules = dict(self.vocab.morphology.exc)
|
||||
serialize = {
|
||||
"vocab": lambda p: self.vocab.to_disk(p),
|
||||
"tag_map": lambda p: srsly.write_msgpack(p, tag_map),
|
||||
"morph_rules": lambda p: srsly.write_msgpack(p, morph_rules),
|
||||
"model": lambda p: self.model.to_disk(p),
|
||||
"cfg": lambda p: srsly.write_json(p, self.cfg),
|
||||
}
|
||||
|
@ -458,22 +380,9 @@ class Tagger(Pipe):
|
|||
except AttributeError:
|
||||
raise ValueError(Errors.E149) from None
|
||||
|
||||
def load_tag_map(p):
|
||||
tag_map = srsly.read_msgpack(p)
|
||||
self.vocab.morphology.load_tag_map(tag_map)
|
||||
|
||||
def load_morph_rules(p):
|
||||
morph_rules = srsly.read_msgpack(p)
|
||||
self.vocab.morphology.load_morph_exceptions(morph_rules)
|
||||
|
||||
self.vocab.morphology = Morphology(self.vocab.strings, dict(),
|
||||
lemmatizer=self.vocab.morphology.lemmatizer)
|
||||
|
||||
deserialize = {
|
||||
"vocab": lambda p: self.vocab.from_disk(p),
|
||||
"cfg": lambda p: self.cfg.update(deserialize_config(p)),
|
||||
"tag_map": load_tag_map,
|
||||
"morph_rules": load_morph_rules,
|
||||
"model": load_model,
|
||||
}
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
|
|
|
@ -30,8 +30,8 @@ bow_model_config = """
|
|||
[model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
ngram_size: 1
|
||||
no_output_layer: false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
"""
|
||||
|
||||
cnn_model_config = """
|
||||
|
@ -48,7 +48,6 @@ embed_size = 2000
|
|||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
||||
"""
|
||||
|
||||
|
||||
|
|
|
@ -20,7 +20,6 @@ embed_size = 2000
|
|||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
||||
"""
|
||||
DEFAULT_TOK2VEC_MODEL = Config().from_str(default_model_config)["model"]
|
||||
|
||||
|
|
|
@ -220,7 +220,6 @@ class ConfigSchemaNlp(BaseModel):
|
|||
lang: StrictStr = Field(..., title="The base language to use")
|
||||
pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
|
||||
tokenizer: Callable = Field(..., title="The tokenizer to use")
|
||||
lemmatizer: Callable = Field(..., title="The lemmatizer to use")
|
||||
load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
|
||||
before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
|
||||
after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
|
||||
|
|
spacy/scorer.py
|
@ -242,7 +242,8 @@ class Scorer:
|
|||
per_feat[field].score_set(
|
||||
pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
|
||||
)
|
||||
return {f"{attr}_per_feat": per_feat}
|
||||
result = {k: v.to_dict() for k, v in per_feat.items()}
|
||||
return {f"{attr}_per_feat": result}
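# With this change the per-feature scores are converted to plain dicts, so
# callers index them as mappings instead of PRFScore objects, e.g. (the score()
# entry point on Scorer is assumed from the updated tests later in this diff):
#
#     results = scorer.score(examples)
#     results["morph_per_feat"]["NounType"]["f"]   # was ["NounType"].fscore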
|
||||
|
||||
@staticmethod
|
||||
def score_spans(
|
||||
|
@ -318,6 +319,7 @@ class Scorer:
|
|||
labels: Iterable[str] = tuple(),
|
||||
multi_label: bool = True,
|
||||
positive_label: Optional[str] = None,
|
||||
threshold: Optional[float] = None,
|
||||
**cfg,
|
||||
) -> Dict[str, Any]:
|
||||
"""Returns PRF and ROC AUC scores for a doc-level attribute with a
|
||||
|
@ -333,94 +335,104 @@ class Scorer:
|
|||
Defaults to True.
|
||||
positive_label (str): The positive label for a binary task with
|
||||
exclusive classes. Defaults to None.
|
||||
threshold (float): Cutoff to consider a prediction "positive". Defaults
|
||||
to 0.5 for multi-label, and 0.0 (i.e. whatever's highest scoring)
|
||||
otherwise.
|
||||
RETURNS (Dict[str, Any]): A dictionary containing the scores, with
|
||||
inapplicable scores as None:
|
||||
for all:
|
||||
attr_score (one of attr_f / attr_macro_f / attr_macro_auc),
|
||||
attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
|
||||
attr_score_desc (text description of the overall score),
|
||||
attr_micro_f,
|
||||
attr_macro_f,
|
||||
attr_auc,
|
||||
attr_f_per_type,
|
||||
attr_auc_per_type
|
||||
for binary exclusive with positive label: attr_p/r/f
|
||||
for 3+ exclusive classes, macro-averaged fscore: attr_macro_f
|
||||
for multilabel, macro-averaged AUC: attr_macro_auc
|
||||
|
||||
DOCS: https://spacy.io/api/scorer#score_cats
|
||||
"""
|
||||
score = PRFScore()
|
||||
f_per_type = dict()
|
||||
auc_per_type = dict()
|
||||
for label in labels:
|
||||
f_per_type[label] = PRFScore()
|
||||
auc_per_type[label] = ROCAUCScore()
|
||||
if threshold is None:
|
||||
threshold = 0.5 if multi_label else 0.0
|
||||
f_per_type = {label: PRFScore() for label in labels}
|
||||
auc_per_type = {label: ROCAUCScore() for label in labels}
|
||||
labels = set(labels)
|
||||
if labels:
|
||||
for eg in examples:
|
||||
labels.update(eg.predicted.cats.keys())
|
||||
labels.update(eg.reference.cats.keys())
|
||||
for example in examples:
|
||||
gold_doc = example.reference
|
||||
pred_doc = example.predicted
|
||||
gold_values = getter(gold_doc, attr)
|
||||
pred_values = getter(pred_doc, attr)
|
||||
if (
|
||||
len(gold_values) > 0
|
||||
and set(f_per_type) == set(auc_per_type) == set(gold_values)
|
||||
and set(gold_values) == set(pred_values)
|
||||
):
|
||||
gold_val = max(gold_values, key=gold_values.get)
|
||||
pred_val = max(pred_values, key=pred_values.get)
|
||||
if positive_label:
|
||||
score.score_set(
|
||||
set([positive_label]) & set([pred_val]),
|
||||
set([positive_label]) & set([gold_val]),
|
||||
)
|
||||
for label in set(gold_values):
|
||||
auc_per_type[label].score_set(
|
||||
pred_values[label], gold_values[label]
|
||||
)
|
||||
f_per_type[label].score_set(
|
||||
set([label]) & set([pred_val]), set([label]) & set([gold_val])
|
||||
)
|
||||
elif len(f_per_type) > 0:
|
||||
model_labels = set(f_per_type)
|
||||
eval_labels = set(gold_values)
|
||||
raise ValueError(
|
||||
Errors.E162.format(
|
||||
model_labels=model_labels, eval_labels=eval_labels
|
||||
)
|
||||
)
|
||||
elif len(auc_per_type) > 0:
|
||||
model_labels = set(auc_per_type)
|
||||
eval_labels = set(gold_values)
|
||||
raise ValueError(
|
||||
Errors.E162.format(
|
||||
model_labels=model_labels, eval_labels=eval_labels
|
||||
)
|
||||
)
|
||||
# Through this loop, None in the gold_cats indicates missing label.
|
||||
pred_cats = getter(example.predicted, attr)
|
||||
gold_cats = getter(example.reference, attr)
|
||||
|
||||
# I think the AUC metric is applicable regardless of whether we're
|
||||
# doing multi-label classification? Unsure. If not, move this into
|
||||
# the elif pred_cats and gold_cats block below.
|
||||
for label in labels:
|
||||
pred_score = pred_cats.get(label, 0.0)
|
||||
gold_score = gold_cats.get(label, 0.0)
|
||||
if gold_score is not None:
|
||||
auc_per_type[label].score_set(pred_score, gold_score)
|
||||
if multi_label:
|
||||
for label in labels:
|
||||
pred_score = pred_cats.get(label, 0.0)
|
||||
gold_score = gold_cats.get(label, 0.0)
|
||||
if gold_score is not None:
|
||||
if pred_score >= threshold and gold_score > 0:
|
||||
f_per_type[label].tp += 1
|
||||
elif pred_score >= threshold and gold_score == 0:
|
||||
f_per_type[label].fp += 1
|
||||
elif pred_score < threshold and gold_score > 0:
|
||||
f_per_type[label].fn += 1
|
||||
elif pred_cats and gold_cats:
|
||||
# Get the highest-scoring for each.
|
||||
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
|
||||
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
|
||||
if gold_score is not None:
|
||||
if pred_label == gold_label and pred_score >= threshold:
|
||||
f_per_type[pred_label].tp += 1
|
||||
else:
|
||||
f_per_type[gold_label].fn += 1
|
||||
if pred_score >= threshold:
|
||||
f_per_type[pred_label].fp += 1
|
||||
elif gold_cats:
|
||||
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
|
||||
if gold_score is not None and gold_score > 0:
|
||||
f_per_type[gold_label].fn += 1
|
||||
else:
|
||||
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
|
||||
if pred_score >= threshold:
|
||||
f_per_type[pred_label].fp += 1
|
||||
micro_prf = PRFScore()
|
||||
for label_prf in f_per_type.values():
|
||||
micro_prf.tp += label_prf.tp
micro_prf.fn += label_prf.fn
micro_prf.fp += label_prf.fp
|
||||
n_cats = len(f_per_type) + 1e-100
|
||||
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
|
||||
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
|
||||
macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
|
||||
results = {
|
||||
f"{attr}_score": None,
|
||||
f"{attr}_score_desc": None,
|
||||
f"{attr}_p": None,
|
||||
f"{attr}_r": None,
|
||||
f"{attr}_f": None,
|
||||
f"{attr}_macro_f": None,
|
||||
f"{attr}_micro_p": micro_prf.precision,
|
||||
f"{attr}_micro_r": micro_prf.recall,
|
||||
f"{attr}_micro_f": micro_prf.fscore,
|
||||
f"{attr}_macro_p": macro_p,
|
||||
f"{attr}_macro_r": macro_r,
|
||||
f"{attr}_macro_f": macro_f,
|
||||
f"{attr}_macro_auc": None,
|
||||
f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
|
||||
f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
|
||||
}
|
||||
if len(labels) == 2 and not multi_label and positive_label:
|
||||
results[f"{attr}_p"] = score.precision
|
||||
results[f"{attr}_r"] = score.recall
|
||||
results[f"{attr}_f"] = score.fscore
|
||||
results[f"{attr}_score"] = results[f"{attr}_f"]
|
||||
positive_label_f = results[f"{attr}_f_per_type"][positive_label]['f']
|
||||
results[f"{attr}_score"] = positive_label_f
|
||||
results[f"{attr}_score_desc"] = f"F ({positive_label})"
|
||||
elif not multi_label:
|
||||
results[f"{attr}_macro_f"] = sum(
|
||||
[score.fscore for label, score in f_per_type.items()]
|
||||
) / (len(f_per_type) + 1e-100)
|
||||
results[f"{attr}_score"] = results[f"{attr}_macro_f"]
|
||||
results[f"{attr}_score_desc"] = "macro F"
|
||||
else:
|
||||
results[f"{attr}_macro_auc"] = max(
|
||||
sum([score.score for label, score in auc_per_type.items()])
|
||||
/ (len(auc_per_type) + 1e-100),
|
||||
-1,
|
||||
)
|
||||
results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
|
||||
results[f"{attr}_score_desc"] = "macro AUC"
|
||||
return results
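# How the aggregate numbers above relate to the per-label counts, as a
# self-contained sketch (the counts are made up for illustration):
#
#     counts = {"POS": dict(tp=8, fp=2, fn=1), "NEG": dict(tp=1, fp=1, fn=4)}
#
#     def prf(tp, fp, fn):
#         p = tp / (tp + fp + 1e-100)
#         r = tp / (tp + fn + 1e-100)
#         return p, r, 2 * p * r / (p + r + 1e-100)
#
#     # micro-averaging pools the counts over labels, then computes P/R/F once
#     micro_p, micro_r, micro_f = prf(
#         sum(c["tp"] for c in counts.values()),
#         sum(c["fp"] for c in counts.values()),
#         sum(c["fn"] for c in counts.values()),
#     )
#     # macro-averaging computes P/R/F per label, then averages the scores
#     per_label = [prf(**c) for c in counts.values()]
#     macro_f = sum(f for _, _, f in per_label) / len(per_label)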
|
||||
|
|
|
@ -201,7 +201,7 @@ def ru_tokenizer():
|
|||
@pytest.fixture
|
||||
def ru_lemmatizer():
|
||||
pytest.importorskip("pymorphy2")
|
||||
return get_lang_class("ru")().vocab.morphology.lemmatizer
|
||||
return get_lang_class("ru")().add_pipe("lemmatizer")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
|
|
|
@ -1,21 +1,12 @@
|
|||
import pytest
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.tokens import Doc
|
||||
from spacy.lemmatizer import Lemmatizer
|
||||
from spacy.lookups import Lookups
|
||||
from spacy import util
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lemmatizer():
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"})
|
||||
return Lemmatizer(lookups)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def vocab(lemmatizer):
|
||||
return Vocab(lemmatizer=lemmatizer)
|
||||
def vocab():
|
||||
return Vocab()
|
||||
|
||||
|
||||
def test_empty_doc(vocab):
|
||||
|
@ -30,14 +21,6 @@ def test_single_word(vocab):
|
|||
assert doc.text == "a"
|
||||
|
||||
|
||||
def test_lookup_lemmatization(vocab):
|
||||
doc = Doc(vocab, words=["dogs", "dogses"])
|
||||
assert doc[0].text == "dogs"
|
||||
assert doc[0].lemma_ == "dog"
|
||||
assert doc[1].text == "dogses"
|
||||
assert doc[1].lemma_ == "dogses"
|
||||
|
||||
|
||||
def test_create_from_words_and_text(vocab):
|
||||
# no whitespace in words
|
||||
words = ["'", "dogs", "'", "run"]
|
||||
|
|
|
@ -1,23 +1,17 @@
|
|||
import pytest
|
||||
from spacy.symbols import POS, PRON, VERB
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def i_has(en_tokenizer):
|
||||
doc = en_tokenizer("I has")
|
||||
tag_map = {
|
||||
"PRP": {POS: PRON, "PronType": "prs"},
|
||||
"VBZ": {
|
||||
POS: VERB,
|
||||
"VerbForm": "fin",
|
||||
"Tense": "pres",
|
||||
"Number": "sing",
|
||||
"Person": "three",
|
||||
},
|
||||
doc[0].morph_ = {"PronType": "prs"}
|
||||
doc[1].morph_ = {
|
||||
"VerbForm": "fin",
|
||||
"Tense": "pres",
|
||||
"Number": "sing",
|
||||
"Person": "three",
|
||||
}
|
||||
en_tokenizer.vocab.morphology.load_tag_map(tag_map)
|
||||
doc[0].tag_ = "PRP"
|
||||
doc[1].tag_ = "VBZ"
|
||||
|
||||
return doc
|
||||
|
||||
|
||||
|
|
|
@ -124,7 +124,6 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
|
|||
assert doc[0].text == "The players"
|
||||
assert doc[0].tag_ == "NN"
|
||||
assert doc[0].pos_ == "NOUN"
|
||||
assert doc[0].lemma_ == "The players"
|
||||
doc = get_doc(
|
||||
tokens.vocab,
|
||||
words=[t.text for t in tokens],
|
||||
|
@ -143,11 +142,9 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
|
|||
assert doc[0].text == "The players"
|
||||
assert doc[0].tag_ == "NN"
|
||||
assert doc[0].pos_ == "NOUN"
|
||||
assert doc[0].lemma_ == "The players"
|
||||
assert doc[1].text == "start ."
|
||||
assert doc[1].tag_ == "VBZ"
|
||||
assert doc[1].pos_ == "VERB"
|
||||
assert doc[1].lemma_ == "start ."
|
||||
|
||||
|
||||
def test_doc_retokenize_spans_merge_heads(en_tokenizer):
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
from spacy.symbols import POS, PRON, VERB, DET, NOUN, PUNCT
|
||||
from ...util import get_doc
|
||||
|
||||
|
||||
def test_en_tagger_load_morph_exc(en_tokenizer):
|
||||
text = "I like his style."
|
||||
tags = ["PRP", "VBP", "PRP$", "NN", "."]
|
||||
tag_map = {
|
||||
"PRP": {POS: PRON},
|
||||
"VBP": {POS: VERB},
|
||||
"PRP$": {POS: DET},
|
||||
"NN": {POS: NOUN},
|
||||
".": {POS: PUNCT},
|
||||
}
|
||||
morph_exc = {"VBP": {"like": {"lemma": "luck"}}}
|
||||
en_tokenizer.vocab.morphology.load_tag_map(tag_map)
|
||||
en_tokenizer.vocab.morphology.load_morph_exceptions(morph_exc)
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags)
|
||||
assert doc[1].tag_ == "VBP"
|
||||
assert doc[1].lemma_ == "luck"
|
|
@ -3,15 +3,16 @@ import pytest
|
|||
from ...util import get_doc
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="TODO: investigate why lemmatizer fails here")
|
||||
def test_ru_doc_lemmatization(ru_tokenizer):
|
||||
def test_ru_doc_lemmatization(ru_lemmatizer):
|
||||
words = ["мама", "мыла", "раму"]
|
||||
tags = [
|
||||
"NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
|
||||
"VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
|
||||
"NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
|
||||
pos = ["NOUN", "VERB", "NOUN"]
|
||||
morphs = [
|
||||
"Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
|
||||
"Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
|
||||
"Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
|
||||
]
|
||||
doc = get_doc(ru_tokenizer.vocab, words=words, tags=tags)
|
||||
doc = get_doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
|
||||
doc = ru_lemmatizer(doc)
|
||||
lemmas = [token.lemma_ for token in doc]
|
||||
assert lemmas == ["мама", "мыть", "рама"]
|
||||
|
||||
|
@ -27,43 +28,51 @@ def test_ru_doc_lemmatization(ru_tokenizer):
|
|||
],
|
||||
)
|
||||
def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
|
||||
assert sorted(ru_lemmatizer.noun(text)) == lemmas
|
||||
doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"])
|
||||
result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
|
||||
assert sorted(result_lemmas) == lemmas
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,pos,morphology,lemma",
|
||||
"text,pos,morph,lemma",
|
||||
[
|
||||
("рой", "NOUN", None, "рой"),
|
||||
("рой", "VERB", None, "рыть"),
|
||||
("клей", "NOUN", None, "клей"),
|
||||
("клей", "VERB", None, "клеить"),
|
||||
("три", "NUM", None, "три"),
|
||||
("кос", "NOUN", {"Number": "Sing"}, "кос"),
|
||||
("кос", "NOUN", {"Number": "Plur"}, "коса"),
|
||||
("кос", "ADJ", None, "косой"),
|
||||
("потом", "NOUN", None, "пот"),
|
||||
("потом", "ADV", None, "потом"),
|
||||
("рой", "NOUN", "", "рой"),
|
||||
("рой", "VERB", "", "рыть"),
|
||||
("клей", "NOUN", "", "клей"),
|
||||
("клей", "VERB", "", "клеить"),
|
||||
("три", "NUM", "", "три"),
|
||||
("кос", "NOUN", "Number=Sing", "кос"),
|
||||
("кос", "NOUN", "Number=Plur", "коса"),
|
||||
("кос", "ADJ", "", "косой"),
|
||||
("потом", "NOUN", "", "пот"),
|
||||
("потом", "ADV", "", "потом"),
|
||||
],
|
||||
)
|
||||
def test_ru_lemmatizer_works_with_different_pos_homonyms(
|
||||
ru_lemmatizer, text, pos, morphology, lemma
|
||||
ru_lemmatizer, text, pos, morph, lemma
|
||||
):
|
||||
assert ru_lemmatizer(text, pos, morphology) == [lemma]
|
||||
doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph])
|
||||
result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
|
||||
assert result_lemmas == [lemma]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,morphology,lemma",
|
||||
"text,morph,lemma",
|
||||
[
|
||||
("гвоздики", {"Gender": "Fem"}, "гвоздика"),
|
||||
("гвоздики", {"Gender": "Masc"}, "гвоздик"),
|
||||
("вина", {"Gender": "Fem"}, "вина"),
|
||||
("вина", {"Gender": "Neut"}, "вино"),
|
||||
("гвоздики", "Gender=Fem", "гвоздика"),
|
||||
("гвоздики", "Gender=Masc", "гвоздик"),
|
||||
("вина", "Gender=Fem", "вина"),
|
||||
("вина", "Gender=Neut", "вино"),
|
||||
],
|
||||
)
|
||||
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma):
|
||||
assert ru_lemmatizer.noun(text, morphology) == [lemma]
|
||||
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma):
|
||||
doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph])
|
||||
result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
|
||||
assert result_lemmas == [lemma]
|
||||
|
||||
|
||||
def test_ru_lemmatizer_punct(ru_lemmatizer):
|
||||
assert ru_lemmatizer.punct("«") == ['"']
|
||||
assert ru_lemmatizer.punct("»") == ['"']
|
||||
doc = get_doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"])
|
||||
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
|
||||
doc = get_doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
|
||||
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
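# For reference, the pymorphy2 calls exercised above reduce to something like
# the following (a sketch of the underlying library, not of spaCy's wrapper;
# how POS and morphology filter the candidates is glossed over):
#
#     import pymorphy2
#
#     morph = pymorphy2.MorphAnalyzer()
#     [p.normal_form for p in morph.parse("мыла")]  # e.g. "мыть", "мыло", ...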
|
||||
|
|
spacy/tests/lang/test_lemmatizers.py (new file)
|
@ -0,0 +1,34 @@
|
|||
import pytest
|
||||
from spacy import registry
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.util import get_lang_class
|
||||
|
||||
|
||||
# fmt: off
|
||||
# Only include languages with no external dependencies
|
||||
# excluded: ru, uk
|
||||
# excluded for custom tables: pl
|
||||
LANGUAGES = ["el", "en", "fr", "nl"]
|
||||
# fmt: on
|
||||
|
||||
|
||||
@pytest.mark.parametrize("lang", LANGUAGES)
|
||||
def test_lemmatizer_initialize(lang, capfd):
|
||||
@registry.assets("lemmatizer_init_lookups")
|
||||
def lemmatizer_init_lookups():
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope"})
|
||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||
return lookups
|
||||
|
||||
"""Test that languages can be initialized."""
|
||||
nlp = get_lang_class(lang)()
|
||||
nlp.add_pipe(
|
||||
"lemmatizer", config={"lookups": {"@assets": "lemmatizer_init_lookups"}}
|
||||
)
|
||||
# Check for stray print statements (see #3342)
|
||||
doc = nlp("test") # noqa: F841
|
||||
captured = capfd.readouterr()
|
||||
assert not captured.out
|
|
@ -1,14 +1,11 @@
|
|||
import pytest
|
||||
from spacy.morphology import Morphology
|
||||
from spacy.strings import StringStore, get_string_id
|
||||
from spacy.lemmatizer import Lemmatizer
|
||||
from spacy.lookups import Lookups
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def morphology():
|
||||
lemmatizer = Lemmatizer(Lookups())
|
||||
return Morphology(StringStore(), {}, lemmatizer)
|
||||
return Morphology(StringStore())
|
||||
|
||||
|
||||
def test_init(morphology):
|
||||
|
|
|
@ -2,21 +2,18 @@ import pytest
|
|||
import pickle
|
||||
from spacy.morphology import Morphology
|
||||
from spacy.strings import StringStore
|
||||
from spacy.lemmatizer import Lemmatizer
|
||||
from spacy.lookups import Lookups
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def morphology():
|
||||
tag_map = {"A": {"POS": "X"}, "B": {"POS": "NOUN"}}
|
||||
exc = {"A": {"a": {"POS": "VERB"}}}
|
||||
lemmatizer = Lemmatizer(Lookups())
|
||||
return Morphology(StringStore(), tag_map, lemmatizer, exc=exc)
|
||||
morphology = Morphology(StringStore())
|
||||
morphology.add("Feat1=Val1|Feat2=Val2")
|
||||
morphology.add("Feat3=Val3|Feat4=Val4")
|
||||
return morphology
|
||||
|
||||
|
||||
def test_morphology_pickle_roundtrip(morphology):
|
||||
b = pickle.dumps(morphology)
|
||||
reloaded_morphology = pickle.loads(b)
|
||||
|
||||
assert morphology.tag_map == reloaded_morphology.tag_map
|
||||
assert morphology.exc == reloaded_morphology.exc
|
||||
assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2"
|
||||
assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4"
|
||||
|
|
|
@ -82,10 +82,10 @@ def test_parser_merge_pp(en_tokenizer):
|
|||
text = "A phrase with another phrase occurs"
|
||||
heads = [1, 4, -1, 1, -2, 0]
|
||||
deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT"]
|
||||
tags = ["DT", "NN", "IN", "DT", "NN", "VBZ"]
|
||||
pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags
|
||||
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos,
|
||||
)
|
||||
with doc.retokenize() as retokenizer:
|
||||
for np in doc.noun_chunks:
|
||||
|
|
spacy/tests/pipeline/test_lemmatizer.py (new file)
|
@ -0,0 +1,109 @@
|
|||
import pytest
|
||||
|
||||
from spacy import util, registry
|
||||
from spacy.lang.en import English
|
||||
from spacy.lookups import Lookups, load_lookups
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def nlp():
|
||||
return English()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lemmatizer(nlp):
|
||||
@registry.assets("cope_lookups")
|
||||
def cope_lookups():
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope"})
|
||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||
return lookups
|
||||
|
||||
lemmatizer = nlp.add_pipe(
|
||||
"lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
|
||||
)
|
||||
return lemmatizer
|
||||
|
||||
|
||||
def test_lemmatizer_init(nlp):
|
||||
@registry.assets("cope_lookups")
|
||||
def cope_lookups():
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope"})
|
||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||
return lookups
|
||||
|
||||
lemmatizer = nlp.add_pipe(
|
||||
"lemmatizer", config={"mode": "lookup", "lookups": {"@assets": "cope_lookups"}}
|
||||
)
|
||||
assert isinstance(lemmatizer.lookups, Lookups)
|
||||
assert lemmatizer.mode == "lookup"
|
||||
# replace any tables from spacy-lookups-data
|
||||
lemmatizer.lookups = Lookups()
|
||||
doc = nlp("coping")
|
||||
# lookup with no tables sets text as lemma
|
||||
assert doc[0].lemma_ == "coping"
|
||||
|
||||
nlp.remove_pipe("lemmatizer")
|
||||
|
||||
@registry.assets("empty_lookups")
|
||||
def empty_lookups():
|
||||
return Lookups()
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
nlp.add_pipe(
|
||||
"lemmatizer",
|
||||
config={"mode": "lookup", "lookups": {"@assets": "empty_lookups"}},
|
||||
)
|
||||
|
||||
|
||||
def test_lemmatizer_config(nlp, lemmatizer):
|
||||
doc = nlp.make_doc("coping")
|
||||
doc[0].pos_ = "VERB"
|
||||
assert doc[0].lemma_ == ""
|
||||
doc = lemmatizer(doc)
|
||||
assert doc[0].text == "coping"
|
||||
assert doc[0].lemma_ == "cope"
|
||||
|
||||
doc = nlp.make_doc("coping")
|
||||
doc[0].pos_ = "VERB"
|
||||
assert doc[0].lemma_ == ""
|
||||
doc = lemmatizer(doc)
|
||||
assert doc[0].text == "coping"
|
||||
assert doc[0].lemma_ == "cope"
|
||||
|
||||
|
||||
def test_lemmatizer_serialize(nlp, lemmatizer):
|
||||
@registry.assets("cope_lookups")
|
||||
def cope_lookups():
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope"})
|
||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||
return lookups
|
||||
|
||||
nlp2 = English()
|
||||
lemmatizer2 = nlp2.add_pipe(
|
||||
"lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
|
||||
)
|
||||
lemmatizer2.from_bytes(lemmatizer.to_bytes())
|
||||
assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
|
||||
assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
|
||||
|
||||
# Also test the results are still the same after IO
|
||||
with make_tempdir() as tmp_dir:
|
||||
nlp.to_disk(tmp_dir)
|
||||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
doc2 = nlp2.make_doc("coping")
|
||||
doc2[0].pos_ = "VERB"
|
||||
assert doc2[0].lemma_ == ""
|
||||
doc2 = lemmatizer(doc2)
|
||||
assert doc2[0].text == "coping"
|
||||
assert doc2[0].lemma_ == "cope"
|
|
@ -23,13 +23,12 @@ def test_tagger_begin_training_tag_map():
|
|||
nlp = Language()
|
||||
tagger = nlp.add_pipe("tagger")
|
||||
orig_tag_count = len(tagger.labels)
|
||||
tagger.add_label("A", {"POS": "NOUN"})
|
||||
tagger.add_label("A")
|
||||
nlp.begin_training()
|
||||
assert nlp.vocab.morphology.tag_map["A"] == {POS: NOUN}
|
||||
assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)
|
||||
|
||||
|
||||
TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
|
||||
TAGS = ("N", "V", "J")
|
||||
|
||||
MORPH_RULES = {"V": {"like": {"lemma": "luck"}}}
|
||||
|
||||
|
@ -42,15 +41,12 @@ TRAIN_DATA = [
|
|||
def test_overfitting_IO():
|
||||
# Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
|
||||
nlp = English()
|
||||
nlp.vocab.morphology.load_tag_map(TAG_MAP)
|
||||
nlp.vocab.morphology.load_morph_exceptions(MORPH_RULES)
|
||||
tagger = nlp.add_pipe("tagger", config={"set_morphology": True})
|
||||
nlp.vocab.morphology.load_tag_map(TAG_MAP)
|
||||
tagger = nlp.add_pipe("tagger")
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
for tag, values in TAG_MAP.items():
|
||||
tagger.add_label(tag, values)
|
||||
for tag in TAGS:
|
||||
tagger.add_label(tag)
|
||||
optimizer = nlp.begin_training()
|
||||
|
||||
for i in range(50):
|
||||
|
@ -65,7 +61,6 @@ def test_overfitting_IO():
|
|||
assert doc[1].tag_ is "V"
|
||||
assert doc[2].tag_ is "J"
|
||||
assert doc[3].tag_ is "N"
|
||||
assert doc[1].lemma_ == "luck"
|
||||
|
||||
# Also test the results are still the same after IO
|
||||
with make_tempdir() as tmp_dir:
|
||||
|
@ -76,4 +71,3 @@ def test_overfitting_IO():
|
|||
assert doc2[1].tag_ is "V"
|
||||
assert doc2[2].tag_ is "J"
|
||||
assert doc2[3].tag_ is "N"
|
||||
assert doc[1].lemma_ == "luck"
|
||||
|
|
|
@ -117,8 +117,10 @@ def test_overfitting_IO():
|
|||
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
|
||||
|
||||
# Test scoring
|
||||
scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
|
||||
assert scores["cats_f"] == 1.0
|
||||
scores = nlp.evaluate(
|
||||
train_examples, scorer_cfg={"positive_label": "POSITIVE"}
|
||||
)
|
||||
assert scores["cats_micro_f"] == 1.0
|
||||
assert scores["cats_score"] == 1.0
|
||||
assert "cats_score_desc" in scores
|
||||
|
||||
|
|
|
@ -8,10 +8,8 @@ from spacy.attrs import IS_PUNCT, ORTH, LOWER
|
|||
from spacy.symbols import POS, VERB
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.lang.en import English
|
||||
from spacy.lemmatizer import Lemmatizer
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.tokens import Doc, Span
|
||||
from spacy.lang.en.lemmatizer import is_base_form
|
||||
|
||||
from ..util import get_doc, make_tempdir
|
||||
|
||||
|
@ -157,16 +155,15 @@ def test_issue590(en_vocab):
|
|||
assert len(matches) == 2
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Old vocab-based lemmatization")
|
||||
def test_issue595():
|
||||
"""Test lemmatization of base forms"""
|
||||
words = ["Do", "n't", "feed", "the", "dog"]
|
||||
tag_map = {"VB": {POS: VERB, "VerbForm": "inf"}}
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
|
||||
lookups.add_table("lemma_index", {"verb": {}})
|
||||
lookups.add_table("lemma_exc", {"verb": {}})
|
||||
lemmatizer = Lemmatizer(lookups, is_base_form=is_base_form)
|
||||
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
|
||||
vocab = Vocab()
|
||||
doc = Doc(vocab, words=words)
|
||||
doc[2].tag_ = "VB"
|
||||
assert doc[2].text == "feed"
|
||||
|
@ -389,6 +386,7 @@ def test_issue891(en_tokenizer, text):
|
|||
assert tokens[1].text == "/"
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Old vocab-based lemmatization")
|
||||
@pytest.mark.parametrize(
|
||||
"text,tag,lemma",
|
||||
[("anus", "NN", "anus"), ("princess", "NN", "princess"), ("inner", "JJ", "inner")],
|
||||
|
|
|
@ -6,7 +6,6 @@ from spacy.lang.en import English
|
|||
from spacy.lang.lex_attrs import LEX_ATTRS
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.tokenizer import Tokenizer
|
||||
from spacy.lemmatizer import Lemmatizer
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.symbols import ORTH, LEMMA, POS, VERB
|
||||
|
||||
|
@ -57,6 +56,7 @@ def test_issue1242():
|
|||
assert len(docs[1]) == 1
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="v3 no longer supports LEMMA/POS in tokenizer special cases")
|
||||
def test_issue1250():
|
||||
"""Test cached special cases."""
|
||||
special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}]
|
||||
|
@ -87,20 +87,6 @@ def test_issue1375():
|
|||
assert doc[1].nbor(1).text == "2"
|
||||
|
||||
|
||||
def test_issue1387():
|
||||
tag_map = {"VBG": {POS: VERB, "VerbForm": "part"}}
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||
lemmatizer = Lemmatizer(lookups)
|
||||
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
|
||||
doc = Doc(vocab, words=["coping"])
|
||||
doc[0].tag_ = "VBG"
|
||||
assert doc[0].text == "coping"
|
||||
assert doc[0].lemma_ == "cope"
|
||||
|
||||
|
||||
def test_issue1434():
|
||||
"""Test matches occur when optional element at end of short doc."""
|
||||
pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]
|
||||
|
|
|
@ -130,8 +130,6 @@ def test_issue1727():
|
|||
vectors = Vectors(data=data, keys=["I", "am", "Matt"])
|
||||
tagger = nlp.create_pipe("tagger")
|
||||
tagger.add_label("PRP")
|
||||
with pytest.warns(UserWarning):
|
||||
tagger.begin_training()
|
||||
assert tagger.cfg.get("pretrained_dims", 0) == 0
|
||||
tagger.vocab.vectors = vectors
|
||||
with make_tempdir() as path:
|
||||
|
|
|
@ -19,8 +19,8 @@ def test_issue2564():
|
|||
"""Test the tagger sets is_tagged correctly when used via Language.pipe."""
|
||||
nlp = Language()
|
||||
tagger = nlp.add_pipe("tagger")
|
||||
with pytest.warns(UserWarning):
|
||||
tagger.begin_training() # initialise weights
|
||||
tagger.add_label("A")
|
||||
tagger.begin_training()
|
||||
doc = nlp("hello world")
|
||||
assert doc.is_tagged
|
||||
docs = nlp.pipe(["hello", "world"])
|
||||
|
|
|
@ -241,11 +241,11 @@ def test_issue3449():
|
|||
assert t3[5].text == "I"
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue3456():
|
||||
# this crashed because of a padding error in layer.ops.unflatten in thinc
|
||||
nlp = English()
|
||||
nlp.add_pipe("tagger")
|
||||
tagger = nlp.add_pipe("tagger")
|
||||
tagger.add_label("A")
|
||||
nlp.begin_training()
|
||||
list(nlp.pipe(["hi", ""]))
|
||||
|
||||
|
|
|
@ -149,13 +149,15 @@ def test_issue3540(en_vocab):
|
|||
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
|
||||
assert [token.text for token in doc] == gold_text
|
||||
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
|
||||
for i, lemma in enumerate(gold_lemma):
|
||||
doc[i].lemma_ = lemma
|
||||
assert [token.lemma_ for token in doc] == gold_lemma
|
||||
vectors_1 = [token.vector for token in doc]
|
||||
assert len(vectors_1) == len(doc)
|
||||
|
||||
with doc.retokenize() as retokenizer:
|
||||
heads = [(doc[3], 1), doc[2]]
|
||||
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
|
||||
attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]}
|
||||
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
|
||||
|
||||
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
|
||||
|
|
|
@ -271,6 +271,7 @@ def test_issue4267():
|
|||
assert token.ent_iob == 2
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="lemmatizer lookups no longer in vocab")
|
||||
def test_issue4272():
|
||||
"""Test that lookup table can be accessed from Token.lemma if no POS tags
|
||||
are available."""
|
||||
|
|
|
@ -62,8 +62,7 @@ def tagger():
|
|||
# need to add model for two reasons:
|
||||
# 1. no model leads to error in serialization,
|
||||
# 2. the affected line is the one for model serialization
|
||||
with pytest.warns(UserWarning):
|
||||
tagger.begin_training(pipeline=nlp.pipeline)
|
||||
tagger.begin_training(pipeline=nlp.pipeline)
|
||||
return tagger
|
||||
|
||||
|
||||
|
|
|
@ -48,7 +48,6 @@ window_size = 1
|
|||
embed_size = 2000
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
||||
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
|
@ -78,7 +77,6 @@ embed_size = 5555
|
|||
window_size = 1
|
||||
maxout_pieces = 7
|
||||
subword_features = false
|
||||
dropout = null
|
||||
"""
|
||||
|
||||
|
||||
|
|
|
@ -44,8 +44,8 @@ def blank_parser(en_vocab):
|
|||
def taggers(en_vocab):
|
||||
cfg = {"model": DEFAULT_TAGGER_MODEL}
|
||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
||||
tagger1 = Tagger(en_vocab, model, set_morphology=True)
|
||||
tagger2 = Tagger(en_vocab, model, set_morphology=True)
|
||||
tagger1 = Tagger(en_vocab, model)
|
||||
tagger2 = Tagger(en_vocab, model)
|
||||
return tagger1, tagger2
|
||||
|
||||
|
||||
|
@ -125,8 +125,8 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
|
|||
tagger2.to_disk(file_path2)
|
||||
cfg = {"model": DEFAULT_TAGGER_MODEL}
|
||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
||||
tagger1_d = Tagger(en_vocab, model, set_morphology=True).from_disk(file_path1)
|
||||
tagger2_d = Tagger(en_vocab, model, set_morphology=True).from_disk(file_path2)
|
||||
tagger1_d = Tagger(en_vocab, model).from_disk(file_path1)
|
||||
tagger2_d = Tagger(en_vocab, model).from_disk(file_path2)
|
||||
assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
|
||||
|
||||
|
||||
|
|
|
@ -8,7 +8,6 @@ from ..util import make_tempdir
|
|||
|
||||
test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
|
||||
test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
|
||||
default_strings = ("_SP", "POS=SPACE")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["rat"])
|
||||
|
@ -34,10 +33,8 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
|
|||
assert vocab1.to_bytes() == vocab1_b
|
||||
new_vocab1 = Vocab().from_bytes(vocab1_b)
|
||||
assert new_vocab1.to_bytes() == vocab1_b
|
||||
assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE
|
||||
assert sorted([s for s in new_vocab1.strings]) == sorted(
|
||||
strings1 + list(default_strings)
|
||||
)
|
||||
assert len(new_vocab1.strings) == len(strings1)
|
||||
assert sorted([s for s in new_vocab1.strings]) == sorted(strings1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strings1,strings2", test_strings)
|
||||
|
@ -52,16 +49,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
|
|||
vocab1_d = Vocab().from_disk(file_path1)
|
||||
vocab2_d = Vocab().from_disk(file_path2)
|
||||
# check strings rather than lexemes, which are only reloaded on demand
|
||||
assert strings1 == [s for s in vocab1_d.strings if s not in default_strings]
|
||||
assert strings2 == [s for s in vocab2_d.strings if s not in default_strings]
|
||||
assert strings1 == [s for s in vocab1_d.strings]
|
||||
assert strings2 == [s for s in vocab2_d.strings]
|
||||
if strings1 == strings2:
|
||||
assert [s for s in vocab1_d.strings if s not in default_strings] == [
|
||||
s for s in vocab2_d.strings if s not in default_strings
|
||||
]
|
||||
assert [s for s in vocab1_d.strings] == [s for s in vocab2_d.strings]
|
||||
else:
|
||||
assert [s for s in vocab1_d.strings if s not in default_strings] != [
|
||||
s for s in vocab2_d.strings if s not in default_strings
|
||||
]
|
||||
assert [s for s in vocab1_d.strings] != [s for s in vocab2_d.strings]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||
|
@ -80,7 +73,7 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
|
|||
# Reported in #2153
|
||||
vocab = Vocab(strings=strings)
|
||||
vocab.from_bytes(vocab.to_bytes())
|
||||
assert len(vocab.strings) == len(strings) + 2 # adds _SP and POS=SPACE
|
||||
assert len(vocab.strings) == len(strings)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||
|
|
|
@ -484,12 +484,10 @@ def test_roundtrip_docs_to_docbin(doc):
|
|||
json_file = tmpdir / "roundtrip.json"
|
||||
srsly.write_json(json_file, [docs_to_json(doc)])
|
||||
output_file = tmpdir / "roundtrip.spacy"
|
||||
data = DocBin(docs=[doc]).to_bytes()
|
||||
with output_file.open("wb") as file_:
|
||||
file_.write(data)
|
||||
DocBin(docs=[doc]).to_disk(output_file)
|
||||
reader = Corpus(output_file)
|
||||
reloaded_examples = list(reader(reloaded_nlp))
|
||||
assert len(doc) == sum(len(eg) for eg in reloaded_examples)
|
||||
assert len(doc) == sum(len(eg) for eg in reloaded_examples)
|
||||
reloaded_example = reloaded_examples[0]
|
||||
assert text == reloaded_example.reference.text
|
||||
assert idx == [t.idx for t in reloaded_example.reference]
|
||||
|
@ -512,13 +510,11 @@ def test_make_orth_variants(doc):
|
|||
nlp = English()
|
||||
with make_tempdir() as tmpdir:
|
||||
output_file = tmpdir / "roundtrip.spacy"
|
||||
data = DocBin(docs=[doc]).to_bytes()
|
||||
with output_file.open("wb") as file_:
|
||||
file_.write(data)
|
||||
DocBin(docs=[doc]).to_disk(output_file)
|
||||
# due to randomness, test only that this runs with no errors for now
|
||||
reader = Corpus(output_file)
|
||||
train_example = next(reader(nlp))
|
||||
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
|
||||
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
|
||||
|
||||
|
||||
@pytest.mark.skip("Outdated")
|
||||
|
|
|
@ -1,64 +0,0 @@
|
|||
import pytest
|
||||
from spacy.tokens import Doc
|
||||
from spacy.language import Language
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.lemmatizer import Lemmatizer
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="We probably don't want to support this anymore in v3?")
|
||||
def test_lemmatizer_reflects_lookups_changes():
|
||||
"""Test for an issue that'd cause lookups available in a model loaded from
|
||||
disk to not be reflected in the lemmatizer."""
|
||||
nlp = Language()
|
||||
assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "foo"
|
||||
table = nlp.vocab.lookups.add_table("lemma_lookup")
|
||||
table["foo"] = "bar"
|
||||
assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "bar"
|
||||
table = nlp.vocab.lookups.get_table("lemma_lookup")
|
||||
table["hello"] = "world"
|
||||
# The update to the table should be reflected in the lemmatizer
|
||||
assert Doc(nlp.vocab, words=["hello"])[0].lemma_ == "world"
|
||||
new_nlp = Language()
|
||||
table = new_nlp.vocab.lookups.add_table("lemma_lookup")
|
||||
table["hello"] = "hi"
|
||||
assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "hi"
|
||||
nlp_bytes = nlp.to_bytes()
|
||||
new_nlp.from_bytes(nlp_bytes)
|
||||
# Make sure we have the previously saved lookup table
|
||||
assert "lemma_lookup" in new_nlp.vocab.lookups
|
||||
assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2
|
||||
assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world"
|
||||
assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar"
|
||||
assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world"
|
||||
|
||||
|
||||
def test_tagger_warns_no_lookups():
|
||||
nlp = Language()
|
||||
nlp.vocab.lookups = Lookups()
|
||||
assert not len(nlp.vocab.lookups)
|
||||
tagger = nlp.add_pipe("tagger")
|
||||
with pytest.warns(UserWarning):
|
||||
tagger.begin_training()
|
||||
with pytest.warns(UserWarning):
|
||||
nlp.begin_training()
|
||||
nlp.vocab.lookups.add_table("lemma_lookup")
|
||||
nlp.vocab.lookups.add_table("lexeme_norm")
|
||||
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
|
||||
with pytest.warns(None) as record:
|
||||
nlp.begin_training()
|
||||
assert not record.list
|
||||
|
||||
|
||||
def test_lemmatizer_without_is_base_form_implementation():
|
||||
# Norwegian example from #5658
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_rules", {"noun": []})
|
||||
lookups.add_table("lemma_index", {"noun": {}})
|
||||
lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}})
|
||||
|
||||
lemmatizer = Lemmatizer(lookups, is_base_form=None)
|
||||
assert lemmatizer(
|
||||
"Formuesskatten",
|
||||
"noun",
|
||||
{"Definite": "def", "Gender": "masc", "Number": "sing"},
|
||||
) == ["formuesskatt"]
|
|
@ -259,7 +259,7 @@ def test_tag_score(tagged_doc):
|
|||
assert results["tag_acc"] == 1.0
|
||||
assert results["pos_acc"] == 1.0
|
||||
assert results["morph_acc"] == 1.0
|
||||
assert results["morph_per_feat"]["NounType"].fscore == 1.0
|
||||
assert results["morph_per_feat"]["NounType"]["f"] == 1.0
|
||||
|
||||
# Gold annotation is modified
|
||||
scorer = Scorer()
|
||||
|
@ -282,9 +282,9 @@ def test_tag_score(tagged_doc):
|
|||
assert results["tag_acc"] == 0.9
|
||||
assert results["pos_acc"] == 0.9
|
||||
assert results["morph_acc"] == approx(0.8)
|
||||
assert results["morph_per_feat"]["NounType"].fscore == 1.0
|
||||
assert results["morph_per_feat"]["Poss"].fscore == 0.0
|
||||
assert results["morph_per_feat"]["Number"].fscore == approx(0.72727272)
|
||||
assert results["morph_per_feat"]["NounType"]["f"] == 1.0
|
||||
assert results["morph_per_feat"]["Poss"]["f"] == 0.0
|
||||
assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
|
||||
|
||||
|
||||
def test_roc_auc_score():
|
||||
|
|
|
@ -112,16 +112,15 @@ def test_tokenizer_validate_special_case(tokenizer, text, tokens):
|
|||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,tokens", [("lorem", [{"orth": "lo", "tag": "NN"}, {"orth": "rem"}])]
|
||||
"text,tokens", [("lorem", [{"orth": "lo", "norm": "LO"}, {"orth": "rem"}])]
|
||||
)
|
||||
def test_tokenizer_add_special_case_tag(text, tokens):
|
||||
vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
|
||||
vocab = Vocab()
|
||||
tokenizer = Tokenizer(vocab, {}, None, None, None)
|
||||
tokenizer.add_special_case(text, tokens)
|
||||
doc = tokenizer(text)
|
||||
assert doc[0].text == tokens[0]["orth"]
|
||||
assert doc[0].tag_ == tokens[0]["tag"]
|
||||
assert doc[0].pos_ == "NOUN"
|
||||
assert doc[0].norm_ == tokens[0]["norm"]
|
||||
assert doc[1].text == tokens[1]["orth"]
|
||||
|
||||
|
||||
|
|
|
@ -9,7 +9,6 @@ from cymem.cymem cimport Pool
|
|||
from preshed.maps cimport PreshMap
|
||||
cimport cython
|
||||
|
||||
from typing import Dict, List, Union, Pattern, Optional, Any
|
||||
import re
|
||||
import warnings
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ from .span cimport Span
|
|||
from .token cimport Token
|
||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||
from ..structs cimport LexemeC, TokenC
|
||||
from ..attrs cimport TAG, MORPH
|
||||
from ..attrs cimport MORPH
|
||||
from ..vocab cimport Vocab
|
||||
|
||||
from .underscore import is_writable_attr
|
||||
|
@ -365,8 +365,6 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
|
|||
doc[token_index + i]._.set(ext_attr_key, ext_attr_value)
|
||||
# NB: We need to call get_string_id here because only the keys are
|
||||
# "intified" (since we support "KEY": [value, value] syntax here).
|
||||
elif attr_name == TAG:
|
||||
doc.vocab.morphology.assign_tag(token, get_string_id(attr_value))
|
||||
else:
|
||||
# Set attributes on both token and lexeme to take care of token
|
||||
# attribute vs. lexical attribute without having to enumerate
|
||||
|
@ -431,8 +429,6 @@ def set_token_attrs(Token py_token, attrs):
|
|||
if attr_name == "_": # Set extension attributes
|
||||
for ext_attr_key, ext_attr_value in attr_value.items():
|
||||
py_token._.set(ext_attr_key, ext_attr_value)
|
||||
elif attr_name == TAG:
|
||||
doc.vocab.morphology.assign_tag(token, attr_value)
|
||||
else:
|
||||
# Set attributes on both token and lexeme to take care of token
|
||||
# attribute vs. lexical attribute without having to enumerate
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from typing import Iterable, Iterator
|
||||
from typing import Iterable, Iterator, Union
|
||||
from pathlib import Path
|
||||
import numpy
|
||||
import zlib
|
||||
import srsly
|
||||
|
@ -9,6 +10,7 @@ from ..vocab import Vocab
|
|||
from ..compat import copy_reg
|
||||
from ..attrs import SPACY, ORTH, intify_attr
|
||||
from ..errors import Errors
|
||||
from ..util import ensure_path
|
||||
|
||||
# fmt: off
|
||||
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
|
||||
|
@ -204,6 +206,30 @@ class DocBin:
|
|||
assert len(tokens.shape) == 2, tokens.shape # this should never happen
|
||||
return self
|
||||
|
||||
def to_disk(self, path: Union[str, Path]) -> None:
|
||||
"""Save the DocBin to a file (typically called .spacy).
|
||||
|
||||
path (str / Path): The file path.
|
||||
|
||||
DOCS: https://spacy.io/api/docbin#to_disk
|
||||
"""
|
||||
path = ensure_path(path)
|
||||
with path.open("wb") as file_:
|
||||
file_.write(self.to_bytes())
|
||||
|
||||
def from_disk(self, path: Union[str, Path]) -> "DocBin":
|
||||
"""Load the DocBin from a file (typically called .spacy).
|
||||
|
||||
path (str / Path): The file path.
|
||||
RETURNS (DocBin): The loaded DocBin.
|
||||
|
||||
DOCS: https://spacy.io/api/docbin#from_disk
|
||||
"""
|
||||
path = ensure_path(path)
|
||||
with path.open("rb") as file_:
|
||||
self.from_bytes(file_.read())
|
||||
return self
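For reference, a minimal usage sketch of the new `to_disk`/`from_disk` helpers (assuming `nlp` is a loaded pipeline and `docs` is a list of `Doc` objects it produced):

```python
# Minimal sketch of the new DocBin serialization helpers. Assumes `nlp` is a
# loaded pipeline and `docs` is a list of Doc objects created with it.
from spacy.tokens import DocBin

doc_bin = DocBin(docs=docs)
doc_bin.to_disk("./train.spacy")                 # conventionally a .spacy file

reloaded = DocBin().from_disk("./train.spacy")   # from_disk returns the DocBin
docs_again = list(reloaded.get_docs(nlp.vocab))
```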
|
||||
|
||||
|
||||
def merge_bins(bins):
|
||||
merged = None
|
||||
|
|
|
@ -832,13 +832,6 @@ cdef class Doc:
|
|||
rel_head_index=abs_head_index-i
|
||||
)
|
||||
)
|
||||
# Do TAG first. This lets subsequent loop override stuff like POS, LEMMA
|
||||
if TAG in attrs:
|
||||
col = attrs.index(TAG)
|
||||
for i in range(length):
|
||||
value = values[col * stride + i]
|
||||
if value != 0:
|
||||
self.vocab.morphology.assign_tag(&tokens[i], value)
|
||||
# Verify ENT_IOB are proper integers
|
||||
if ENT_IOB in attrs:
|
||||
iob_strings = Token.iob_strings()
|
||||
|
@ -857,12 +850,11 @@ cdef class Doc:
|
|||
for i in range(length):
|
||||
token = &self.c[i]
|
||||
for j in range(n_attrs):
|
||||
if attr_ids[j] != TAG:
|
||||
value = values[j * stride + i]
|
||||
if attr_ids[j] == MORPH:
|
||||
# add morph to morphology table
|
||||
self.vocab.morphology.add(self.vocab.strings[value])
|
||||
Token.set_struct_attr(token, attr_ids[j], value)
|
||||
value = values[j * stride + i]
|
||||
if attr_ids[j] == MORPH:
|
||||
# add morph to morphology table
|
||||
self.vocab.morphology.add(self.vocab.strings[value])
|
||||
Token.set_struct_attr(token, attr_ids[j], value)
|
||||
# Set flags
|
||||
self.is_parsed = bool(self.is_parsed or HEAD in attrs)
|
||||
self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
|
||||
|
|
|
@ -332,11 +332,7 @@ cdef class Token:
|
|||
inflectional suffixes.
|
||||
"""
|
||||
def __get__(self):
|
||||
if self.c.lemma == 0:
|
||||
lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
|
||||
return self.vocab.strings[lemma_]
|
||||
else:
|
||||
return self.c.lemma
|
||||
return self.c.lemma
|
||||
|
||||
def __set__(self, attr_t lemma):
|
||||
self.c.lemma = lemma
|
||||
|
@ -355,7 +351,7 @@ cdef class Token:
|
|||
return self.c.tag
|
||||
|
||||
def __set__(self, attr_t tag):
|
||||
self.vocab.morphology.assign_tag(self.c, tag)
|
||||
self.c.tag = tag
|
||||
|
||||
property dep:
|
||||
"""RETURNS (uint64): ID of syntactic dependency label."""
|
||||
|
@ -888,10 +884,7 @@ cdef class Token:
|
|||
with no inflectional suffixes.
|
||||
"""
|
||||
def __get__(self):
|
||||
if self.c.lemma == 0:
|
||||
return self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
|
||||
else:
|
||||
return self.vocab.strings[self.c.lemma]
|
||||
return self.vocab.strings[self.c.lemma]
|
||||
|
||||
def __set__(self, unicode lemma_):
|
||||
self.c.lemma = self.vocab.strings.add(lemma_)
|
||||
|
|
|
@ -9,11 +9,10 @@ from .lexeme cimport EMPTY_LEXEME, OOV_RANK
|
|||
from .lexeme cimport Lexeme
|
||||
from .typedefs cimport attr_t
|
||||
from .tokens.token cimport Token
|
||||
from .attrs cimport LANG, ORTH, TAG, POS
|
||||
from .attrs cimport LANG, ORTH
|
||||
|
||||
from .compat import copy_reg
|
||||
from .errors import Errors
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .attrs import intify_attrs, NORM, IS_STOP
|
||||
from .vectors import Vectors
|
||||
from .util import registry
|
||||
|
@ -23,7 +22,7 @@ from .lang.norm_exceptions import BASE_NORMS
|
|||
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
|
||||
|
||||
|
||||
def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=True):
|
||||
def create_vocab(lang, defaults, vectors_name=None, load_data=True):
|
||||
# If the spacy-lookups-data package is installed, we pre-populate the lookups
|
||||
# with lexeme data, if available
|
||||
if load_data:
|
||||
|
@ -43,7 +42,6 @@ def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=T
|
|||
)
|
||||
return Vocab(
|
||||
lex_attr_getters=lex_attrs,
|
||||
lemmatizer=lemmatizer,
|
||||
lookups=lookups,
|
||||
writing_system=defaults.writing_system,
|
||||
get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
|
||||
|
@ -58,17 +56,13 @@ cdef class Vocab:
|
|||
|
||||
DOCS: https://spacy.io/api/vocab
|
||||
"""
|
||||
def __init__(self, lex_attr_getters=None, lemmatizer=None,
|
||||
strings=tuple(), lookups=None, tag_map={},
|
||||
def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
|
||||
oov_prob=-20., vectors_name=None, writing_system={},
|
||||
get_noun_chunks=None, **deprecated_kwargs):
|
||||
"""Create the vocabulary.
|
||||
|
||||
lex_attr_getters (dict): A dictionary mapping attribute IDs to
|
||||
functions to compute them. Defaults to `None`.
|
||||
tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained
|
||||
parts-of-speech, and optionally morphological attributes.
|
||||
lemmatizer (object): A lemmatizer. Defaults to `None`.
|
||||
strings (StringStore): StringStore that maps strings to integers, and
|
||||
vice versa.
|
||||
lookups (Lookups): Container for large lookup tables and dictionaries.
|
||||
|
@ -78,8 +72,6 @@ cdef class Vocab:
|
|||
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
||||
if lookups in (None, True, False):
|
||||
lookups = Lookups()
|
||||
if lemmatizer in (None, True, False):
|
||||
lemmatizer = Lemmatizer(lookups)
|
||||
self.cfg = {'oov_prob': oov_prob}
|
||||
self.mem = Pool()
|
||||
self._by_orth = PreshMap()
|
||||
|
@ -89,7 +81,7 @@ cdef class Vocab:
|
|||
for string in strings:
|
||||
_ = self[string]
|
||||
self.lex_attr_getters = lex_attr_getters
|
||||
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
||||
self.morphology = Morphology(self.strings)
|
||||
self.vectors = Vectors(name=vectors_name)
|
||||
self.lookups = lookups
|
||||
self.writing_system = writing_system
|
||||
|
@ -268,12 +260,6 @@ cdef class Vocab:
|
|||
# Set the special tokens up to have arbitrary attributes
|
||||
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
|
||||
token.lex = lex
|
||||
if TAG in props:
|
||||
self.morphology.assign_tag(token, props[TAG])
|
||||
elif POS in props:
|
||||
# Don't allow POS to be set without TAG -- this causes problems,
|
||||
# see #1773
|
||||
props.pop(POS)
|
||||
for attr_id, value in props.items():
|
||||
Token.set_struct_attr(token, attr_id, value)
|
||||
# NORM is the only one that overlaps between the two
|
||||
|
|
|
@ -15,37 +15,194 @@ TODO: intro and how architectures work, link to
|
|||
[`registry`](/api/top-level#registry),
|
||||
[custom models](/usage/training#custom-models) usage etc.
|
||||
|
||||
## Tok2Vec architectures {#tok2vec source="spacy/ml/models/tok2vec.py"}
|
||||
## Tok2Vec architectures {#tok2vec-arch source="spacy/ml/models/tok2vec.py"}
|
||||
|
||||
### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
|
||||
|
||||
<!-- TODO: intro -->
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.HashEmbedCNN.v1"
|
||||
> # TODO: ...
|
||||
>
|
||||
> [model.tok2vec]
|
||||
> # ...
|
||||
> pretrained_vectors = null
|
||||
> width = 96
|
||||
> depth = 4
|
||||
> embed_size = 2000
|
||||
> window_size = 1
|
||||
> maxout_pieces = 3
|
||||
> subword_features = true
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------------- | ----- | ----------- |
|
||||
| `width` | int | |
|
||||
| `depth` | int | |
|
||||
| `embed_size` | int | |
|
||||
| `window_size` | int | |
|
||||
| `maxout_pieces` | int | |
|
||||
| `subword_features` | bool | |
|
||||
| `dropout` | float | |
|
||||
| `pretrained_vectors` | bool | |
|
||||
Build spaCy's 'standard' tok2vec layer, which uses hash embedding with subword
|
||||
features and a CNN with layer-normalized maxout.
|
||||
|
||||
### spacy.HashCharEmbedCNN.v1 {#HashCharEmbedCNN}
|
||||
| Name | Type | Description |
|
||||
| -------------------- | ---- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | int | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. |
|
||||
| `depth` | int | The number of convolutional layers to use. Recommended values are between `2` and `8`. |
|
||||
| `embed_size` | int | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. |
|
||||
| `window_size` | int | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time (see the short arithmetic check after this table). Recommended value is `1`. |
|
||||
| `maxout_pieces` | int | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. |
|
||||
| `subword_features` | bool | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. |
|
||||
| `pretrained_vectors` | bool | Whether to also use static vectors. |
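The receptive-field figure above can be sanity-checked in two lines, under the assumption that each convolutional layer extends the context by `window_size` tokens on either side:

```python
# Receptive field of the stacked CNN, assuming each layer adds window_size
# tokens of context on either side of a token.
depth, window_size = 4, 2
print(depth * window_size * 2 + 1)  # 17
```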
|
||||
|
||||
### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM}
|
||||
### spacy.Tok2Vec.v1 {#Tok2Vec}
|
||||
|
||||
<!-- TODO: example config -->
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.Tok2Vec.v1"
|
||||
>
|
||||
> [model.embed]
|
||||
>
|
||||
> [model.encode]
|
||||
> ```
|
||||
|
||||
Construct a tok2vec model out of embedding and encoding subnetworks. See the
|
||||
["Embed, Encode, Attend, Predict"](https://explosion.ai/blog/deep-learning-formula-nlp)
|
||||
blog post for background.
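As a rough illustration, the sketch below composes two of the subnetworks documented on this page with Thinc's `chain` combinator. The registry access pattern and argument names are taken from the tables here and should be treated as assumptions rather than a stable API at this stage:

```python
# Conceptual sketch only: build a tok2vec model from an embedding and an
# encoding subnetwork. Registry names and arguments follow this page and may
# change; this is not the exact spacy.Tok2Vec.v1 implementation.
from thinc.api import chain
from spacy.util import registry

embed = registry.architectures.get("spacy.MultiHashEmbed.v1")(
    width=96, rows=2000, also_embed_subwords=True, also_use_static_vectors=False
)
encode = registry.architectures.get("spacy.MaxoutWindowEncoder.v1")(
    width=96, window_size=1, maxout_pieces=3, depth=4
)
tok2vec = chain(embed, encode)  # List[Doc] -> List[Floats2d]
```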
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `embed` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Embed tokens into context-independent word vector representations. |
|
||||
| `encode` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Floats2d]`. **Output:** `List[Floats2d]`. Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. |
|
||||
|
||||
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
|
||||
|
||||
<!-- TODO: check example config -->
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.MultiHashEmbed.v1"
|
||||
> width = 64
|
||||
> rows = 2000
|
||||
> also_embed_subwords = false
|
||||
> also_use_static_vectors = false
|
||||
> ```
|
||||
|
||||
Construct an embedding layer that separately embeds a number of lexical
|
||||
attributes using hash embedding, concatenates the results, and passes it through
|
||||
a feed-forward subnetwork to build a mixed representation. The features used
|
||||
are the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying
|
||||
definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from
|
||||
pretrained static vectors can also be incorporated into the concatenated
|
||||
representation.
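To give an intuition for the hashing trick mentioned above, here is a toy version of a single hash embedding table. It is illustrative only: spaCy's implementation uses MurmurHash and more sophisticated mixing, not Python's built-in `hash`:

```python
# Toy hash embedding: any string maps to one of a fixed number of rows, so
# unseen strings still get a vector and collisions are simply tolerated.
import numpy

rows, width = 2000, 64
table = numpy.random.default_rng(0).normal(size=(rows, width)).astype("float32")

def embed_attr(value: str) -> numpy.ndarray:
    return table[hash(value) % rows]

# One vector per lexical attribute (e.g. NORM, PREFIX, SUFFIX, SHAPE), concatenated.
vector = numpy.concatenate([embed_attr(v) for v in ("apple", "ap", "le", "xxxxx")])
print(vector.shape)  # (256,)
```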
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------------- | ---- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | int | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. |
|
||||
| `rows` | int | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. |
|
||||
| `also_embed_subwords` | bool | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. |
|
||||
| `also_use_static_vectors` | bool | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. |
|
||||
|
||||
### spacy.CharacterEmbed.v1 {#CharacterEmbed}
|
||||
|
||||
<!-- TODO: check example config -->
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.CharacterEmbed.v1"
|
||||
> width = 64
|
||||
> rows = 2000
|
||||
> nM = 16
|
||||
> nC = 4
|
||||
> ```
|
||||
|
||||
Construct an embedded representation based on character embeddings, using a
|
||||
feed-forward network. A fixed number of UTF-8 byte characters are used for each
|
||||
word, taken from the beginning and end of the word equally. Padding is used in
|
||||
the center for words that are too short.
|
||||
|
||||
For instance, let's say `nC=4`, and the word is "jumping". The characters used
|
||||
will be `"jung"` (two from the start, two from the end). If we had `nC=8`, the
|
||||
characters would be `"jumpping"`: 4 from the start, 4 from the end. This ensures
|
||||
that the final character is always in the last position, instead of being in an
|
||||
arbitrary position depending on the word length.
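The character selection can be sketched in a few lines (illustrative only; the actual implementation works on UTF-8 bytes and pads short words in the centre):

```python
# Pick nC characters per word: half from the start, half from the end.
def select_chars(word: str, nC: int) -> str:
    half = nC // 2
    start, end = word[:half], word[-half:]
    if len(word) < half:  # toy centre padding for very short words
        return start + " " * (nC - len(start) - len(end)) + end
    return start + end

print(select_chars("jumping", 4))  # "jung"
print(select_chars("jumping", 8))  # "jumpping"
```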
|
||||
|
||||
The characters are embedded in an embedding table with 256 rows, and the vectors
|
||||
concatenated. A hash-embedded vector of the `NORM` of the word is also
|
||||
concatenated on, and the result is then passed through a feed-forward network to
|
||||
construct a single vector to represent the information.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | int | The width of the output vector and the `NORM` hash embedding. |
|
||||
| `rows` | int | The number of rows in the `NORM` hash embedding table. |
|
||||
| `nM` | int | The dimensionality of the character embeddings. Recommended values are between `16` and `64`. |
|
||||
| `nC` | int | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. |
|
||||
|
||||
### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder}
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||
> width = 64
|
||||
> window_size = 1
|
||||
> maxout_pieces = 2
|
||||
> depth = 4
|
||||
> ```
|
||||
|
||||
Encode context using convolutions with maxout activation, layer normalization
|
||||
and residual connections.
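The `window_size` parameter controls how many neighbouring vectors are concatenated onto each token before the maxout layer. Thinc provides an [`expand_window`](https://thinc.ai/docs/api-layers#expand_window) layer for this; the numpy sketch below only illustrates the resulting shapes:

```python
# Shape-only illustration of window concatenation (window_size=1 triples the width).
import numpy

def expand_window(X: numpy.ndarray, window_size: int = 1) -> numpy.ndarray:
    n, d = X.shape
    pad = numpy.zeros((window_size, d))
    padded = numpy.vstack([pad, X, pad])
    return numpy.hstack([padded[i : i + n] for i in range(2 * window_size + 1)])

X = numpy.ones((5, 64))           # 5 tokens, width 64
print(expand_window(X, 1).shape)  # (5, 192)
```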
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. |
|
||||
| `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. |
|
||||
| `maxout_pieces` | int | The number of maxout pieces to use. Recommended values are `2` or `3`. |
|
||||
| `depth` | int | The number of convolutional layers. Recommended value is `4`. |
|
||||
|
||||
### spacy.MishWindowEncoder.v1 {#MishWindowEncoder}
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.MishWindowEncoder.v1"
|
||||
> width = 64
|
||||
> window_size = 1
|
||||
> depth = 4
|
||||
> ```
|
||||
|
||||
Encode context using convolutions with
|
||||
[`Mish`](https://thinc.ai/docs/api-layers#mish) activation, layer normalization
|
||||
and residual connections.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. |
|
||||
| `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. |
|
||||
| `depth` | int | The number of convolutional layers. Recommended value is `4`. |
|
||||
|
||||
### spacy.TorchBiLSTMEncoder.v1 {#TorchBiLSTMEncoder}
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.TorchBiLSTMEncoder.v1"
|
||||
> width = 64
|
||||
> window_size = 1
|
||||
> depth = 4
|
||||
> ```
|
||||
|
||||
Encode context using bidirectional LSTM layers. Requires
|
||||
[PyTorch](https://pytorch.org).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. |
|
||||
| `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. |
|
||||
| `depth` | int | The number of convolutional layers. Recommended value is `4`. |
|
||||
|
||||
## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
|
||||
|
||||
|
@ -98,9 +255,9 @@ architectures into your training config.
|
|||
| `grad_factor` | float | Factor for weighting the gradient if multiple components listen to the same transformer model. |
|
||||
| `pooling` | `Model[Ragged, Floats2d]` | Pooling layer to determine how the vector for each spaCy token will be computed. |
|
||||
|
||||
## Parser & NER architectures {#parser source="spacy/ml/models/parser.py"}
|
||||
## Parser & NER architectures {#parser}
|
||||
|
||||
### spacy.TransitionBasedParser.v1 {#TransitionBasedParser}
|
||||
### spacy.TransitionBasedParser.v1 {#TransitionBasedParser source="spacy/ml/models/parser.py"}
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
|
@ -112,24 +269,100 @@ architectures into your training config.
|
|||
> maxout_pieces = 2
|
||||
>
|
||||
> [model.tok2vec]
|
||||
> # ...
|
||||
> @architectures = "spacy.HashEmbedCNN.v1"
|
||||
> pretrained_vectors = null
|
||||
> width = 96
|
||||
> depth = 4
|
||||
> embed_size = 2000
|
||||
> window_size = 1
|
||||
> maxout_pieces = 3
|
||||
> subword_features = true
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------- | ------------------------------------------ | ----------- |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
|
||||
| `nr_feature_tokens` | int | |
|
||||
| `hidden_width` | int | |
|
||||
| `maxout_pieces` | int | |
|
||||
| `use_upper` | bool | |
|
||||
| `nO` | int | |
|
||||
Build a transition-based parser model. Can apply to NER or dependency parsing.
|
||||
Transition-based parsing is an approach to structured prediction where the task
|
||||
of predicting the structure is mapped to a series of state transitions. You
|
||||
might find [this tutorial](https://explosion.ai/blog/parsing-english-in-python)
|
||||
helpful for background information. The neural network state prediction model
|
||||
consists of either two or three subnetworks:
|
||||
|
||||
- **tok2vec**: Map each token into a vector representation. This subnetwork is
|
||||
run once for each batch.
|
||||
- **lower**: Construct a feature-specific vector for each `(token, feature)`
|
||||
pair. This is also run once for each batch. Constructing the state
|
||||
representation is then simply a matter of summing the component features and
|
||||
applying the non-linearity.
|
||||
- **upper** (optional): A feed-forward network that predicts scores from the
|
||||
state representation. If not present, the output from the lower model is used
|
||||
as action scores directly.
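For intuition, here is a toy greedy transition loop in the arc-standard style with a stand-in scoring function. It only illustrates the idea of predicting structure as a series of state transitions and is not spaCy's implementation:

```python
# Toy arc-standard transition loop with greedy decoding. The scoring function
# stands in for the neural state prediction model described above.
def parse(words, score_action):
    stack, buffer, arcs = [], list(range(len(words))), []
    while buffer or len(stack) > 1:
        valid = (["SHIFT"] if buffer else []) + (
            ["LEFT-ARC", "RIGHT-ARC"] if len(stack) >= 2 else []
        )
        action = max(valid, key=lambda a: score_action(stack, buffer, a))
        if action == "SHIFT":
            stack.append(buffer.pop(0))
        elif action == "LEFT-ARC":   # attach second-to-top as child of top
            child = stack.pop(-2)
            arcs.append((stack[-1], child))
        else:                        # RIGHT-ARC: attach top as child of second-to-top
            child = stack.pop()
            arcs.append((stack[-1], child))
    return arcs                      # (head index, child index) pairs

scores = {"SHIFT": 1.0, "RIGHT-ARC": 0.5, "LEFT-ARC": 0.1}
print(parse(["I", "saw", "her"], lambda s, b, a: scores[a]))  # [(1, 2), (0, 1)]
```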
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
|
||||
| `nr_feature_tokens` | int | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. |
|
||||
| `hidden_width` | int | The width of the hidden layer. |
|
||||
| `maxout_pieces` | int | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. |
|
||||
| `use_upper` | bool | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. |
|
||||
| `nO` | int | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. |
|
||||
|
||||
### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"}
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.BILUOTagger.v1 "
|
||||
>
|
||||
> [model.tok2vec]
|
||||
> @architectures = "spacy.HashEmbedCNN.v1"
|
||||
> # etc.
|
||||
> ```
|
||||
|
||||
Construct a simple NER tagger that predicts
|
||||
[BILUO](/usage/linguistic-features#accessing-ner) tag scores for each token and
|
||||
uses greedy decoding with transition-constraints to return a valid BILUO tag
|
||||
sequence. A BILUO tag sequence encodes a sequence of non-overlapping labelled
|
||||
spans into tags assigned to each token. The first token of a span is given the
|
||||
tag `B-LABEL`, the last token of the span is given the tag `L-LABEL`, and tokens
|
||||
within the span are given the tag `I-LABEL`. Single-token spans are given the
|
||||
tag `U-LABEL`. All other tokens are assigned the tag `O`. The BILUO tag scheme
|
||||
generally results in better linear separation between classes, especially for
|
||||
non-CRF models, because there are more distinct classes for the different
|
||||
situations ([Ratinov et al., 2009](https://www.aclweb.org/anthology/W09-1119/)).
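Spelled out on a small example (no spaCy APIs involved), a two-token ORG span and a single-token GPE span are encoded like this:

```python
# BILUO encoding of non-overlapping spans, as described above.
words = ["Apple", "Inc.", "opened", "in", "Paris"]
spans = [(0, 2, "ORG"), (4, 5, "GPE")]   # (start, end, label), end exclusive

tags = ["O"] * len(words)
for start, end, label in spans:
    if end - start == 1:
        tags[start] = f"U-{label}"
    else:
        tags[start] = f"B-{label}"
        tags[end - 1] = f"L-{label}"
        for i in range(start + 1, end - 1):
            tags[i] = f"I-{label}"

print(tags)  # ['B-ORG', 'L-ORG', 'O', 'O', 'U-GPE']
```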
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
|
||||
|
||||
### spacy.IOBTagger.v1 {#IOBTagger source="spacy/ml/models/simple_ner.py"}
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.IOBTagger.v1 "
|
||||
>
|
||||
> [model.tok2vec]
|
||||
> @architectures = "spacy.HashEmbedCNN.v1"
|
||||
> # etc.
|
||||
> ```
|
||||
|
||||
Construct a simple NER tagger that predicts
|
||||
[IOB](/usage/linguistic-features#accessing-ner) tag scores for each token and
|
||||
uses greedy decoding with transition-constraints to return a valid IOB tag
|
||||
sequence. An IOB tag sequence encodes a sequence of non-overlapping labeled
|
||||
spans into tags assigned to each token. The first token of a span is given the
|
||||
tag `B-LABEL`, and subsequent tokens are given the tag `I-LABEL`. All other tokens
are assigned the tag `O`.
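Reusing the BILUO example from the previous section, the IOB encoding of the same spans is a simple relabelling:

```python
# IOB drops the L- and U- distinctions: L- becomes I-, U- becomes B-.
biluo = ["B-ORG", "L-ORG", "O", "O", "U-GPE"]
iob = [tag.replace("L-", "I-").replace("U-", "B-") for tag in biluo]
print(iob)  # ['B-ORG', 'I-ORG', 'O', 'O', 'B-GPE']
```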
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
|
||||
|
||||
## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
|
||||
|
||||
### spacy.Tagger.v1 {#Tagger}
|
||||
|
||||
<!-- TODO: intro -->
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
> ```ini
|
||||
|
@ -141,26 +374,143 @@ architectures into your training config.
|
|||
> # ...
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------------------------------------ | ----------- |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
|
||||
| `nO` | int | |
|
||||
Build a tagger model, using a provided token-to-vector component. The tagger
|
||||
model simply adds a linear layer with softmax activation to predict scores given
|
||||
the token vectors.
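Shape-wise, the tagger head amounts to a learned projection followed by a per-token softmax. A toy numpy sketch (not spaCy's code):

```python
# Per-token softmax over tag scores produced by a linear projection.
import numpy

def softmax(x):
    e = numpy.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

width, n_tags, n_tokens = 96, 50, 7
W = numpy.zeros((width, n_tags))                     # learned during training
token_vectors = numpy.random.rand(n_tokens, width)   # output of the tok2vec layer
tag_probs = softmax(token_vectors @ W)
print(tag_probs.shape)  # (7, 50); each row sums to 1
```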
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
|
||||
| `nO` | int | The number of tags to output. Inferred from the data if `None`. |
|
||||
|
||||
## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"}
|
||||
|
||||
A text classification architecture needs to take a [`Doc`](/api/doc) as input,
|
||||
and produce a score for each potential label class. Textcat challenges can be
|
||||
binary (e.g. sentiment analysis) or involve multiple possible labels.
|
||||
Problems with multiple possible labels can either have mutually exclusive labels (each example
has exactly one label), or allow several labels to apply to the same example at once.
|
||||
|
||||
As the properties of text classification problems can vary widely, we provide
|
||||
several different built-in architectures. It is recommended to experiment with
|
||||
different architectures and settings to determine what works best on your
|
||||
specific data and challenge.
|
||||
|
||||
### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
|
||||
|
||||
Stacked ensemble of a bag-of-words model and a neural network model. The neural
|
||||
network has an internal CNN Tok2Vec layer and uses attention.
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.TextCatEnsemble.v1"
|
||||
> exclusive_classes = false
|
||||
> pretrained_vectors = null
|
||||
> width = 64
|
||||
> embed_size = 2000
|
||||
> conv_depth = 2
|
||||
> window_size = 1
|
||||
> ngram_size = 1
|
||||
> dropout = null
|
||||
> nO = null
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
|
||||
| `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. |
|
||||
| `width` | int | Output dimension of the feature encoding step. |
|
||||
| `embed_size` | int | Input dimension of the feature encoding step. |
|
||||
| `conv_depth` | int | Depth of the Tok2Vec layer. |
|
||||
| `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. |
|
||||
| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. |
|
||||
| `dropout` | float | The dropout rate. |
|
||||
| `nO` | int | Output dimension, determined by the number of different labels. |
|
||||
|
||||
If the `nO` dimension is not set, the TextCategorizer component will set it when
|
||||
`begin_training` is called.
|
||||
|
||||
### spacy.TextCatCNN.v1 {#TextCatCNN}
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.TextCatCNN.v1"
|
||||
> exclusive_classes = false
|
||||
> nO = null
|
||||
>
|
||||
> [model.tok2vec]
|
||||
> @architectures = "spacy.HashEmbedCNN.v1"
|
||||
> pretrained_vectors = null
|
||||
> width = 96
|
||||
> depth = 4
|
||||
> embed_size = 2000
|
||||
> window_size = 1
|
||||
> maxout_pieces = 3
|
||||
> subword_features = true
|
||||
> ```
|
||||
|
||||
A neural network model where token vectors are calculated using a CNN. The
|
||||
vectors are mean pooled and used as features in a feed-forward network. This
|
||||
architecture is usually less accurate than the ensemble, but runs faster.
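The pooling step is just an average over the token axis; a toy numpy sketch:

```python
# Mean pooling the per-token vectors into one feature vector for the Doc.
import numpy

token_vectors = numpy.random.rand(12, 96)   # 12 tokens from the CNN tok2vec
doc_vector = token_vectors.mean(axis=0)
print(doc_vector.shape)  # (96,)
```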
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------- | ------------------------------------------ | --------------------------------------------------------------- |
|
||||
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
|
||||
| `nO` | int | Output dimension, determined by the number of different labels. |
|
||||
|
||||
If the `nO` dimension is not set, the TextCategorizer component will set it when
|
||||
`begin_training` is called.
|
||||
|
||||
### spacy.TextCatBOW.v1 {#TextCatBOW}
|
||||
|
||||
### spacy.TextCatCNN.v1 {#TextCatCNN}
|
||||
> #### Example Config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.TextCatBOW.v1"
|
||||
> exclusive_classes = false
|
||||
> ngram_size = 1
|
||||
> no_output_layer = false
|
||||
> nO = null
|
||||
> ```
|
||||
|
||||
An ngram "bag-of-words" model. This architecture should run much faster than the
|
||||
others, but may not be as accurate, especially if texts are short.
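What `ngram_size` controls can be seen from a small helper that enumerates the bag-of-words features (illustrative only):

```python
# Bag-of-words features for ngram_size=3: unigrams, bigrams and trigrams.
def ngram_features(tokens, ngram_size):
    feats = []
    for n in range(1, ngram_size + 1):
        feats.extend(" ".join(tokens[i : i + n]) for i in range(len(tokens) - n + 1))
    return feats

print(ngram_features(["not", "very", "good"], 3))
# ['not', 'very', 'good', 'not very', 'very good', 'not very good']
```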
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
|
||||
| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. |
|
||||
| `no_output_layer` | bool | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`). |
|
||||
| `nO` | int | Output dimension, determined by the number of different labels. |
|
||||
|
||||
If the `nO` dimension is not set, the TextCategorizer component will set it when
|
||||
`begin_training` is called.
|
||||
|
||||
### spacy.TextCatLowData.v1 {#TextCatLowData}
|
||||
|
||||
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
|
||||
|
||||
An [`EntityLinker`](/api/entitylinker) component disambiguates textual mentions
|
||||
(tagged as named entities) to unique identifiers, grounding the named entities
|
||||
into the "real world". This requires 3 main components:
|
||||
|
||||
- A [`KnowledgeBase`](/api/kb) (KB) holding the unique identifiers, potential
|
||||
synonyms and prior probabilities.
|
||||
- A candidate generation step to produce a set of likely identifiers, given a
|
||||
certain textual mention.
|
||||
- A Machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the
|
||||
most plausible ID from the set of candidates.
|
||||
|
||||
### spacy.EntityLinker.v1 {#EntityLinker}
|
||||
|
||||
<!-- TODO: intro -->
|
||||
The `EntityLinker` model architecture is a `Thinc` `Model` with a Linear output
|
||||
layer.
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
|
@ -170,10 +520,46 @@ architectures into your training config.
|
|||
> nO = null
|
||||
>
|
||||
> [model.tok2vec]
|
||||
> # ...
|
||||
> @architectures = "spacy.HashEmbedCNN.v1"
|
||||
> pretrained_vectors = null
|
||||
> width = 96
|
||||
> depth = 2
|
||||
> embed_size = 300
|
||||
> window_size = 1
|
||||
> maxout_pieces = 3
|
||||
> subword_features = true
|
||||
>
|
||||
> [kb_loader]
|
||||
> @assets = "spacy.EmptyKB.v1"
|
||||
> entity_vector_length = 64
|
||||
>
|
||||
> [get_candidates]
|
||||
> @assets = "spacy.CandidateGenerator.v1"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------------------------------------ | ----------- |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
|
||||
| `nO` | int | |
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------------------------------------ | ---------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
|
||||
| `nO` | int | Output dimension, determined by the length of the vectors encoding each entity in the KB |
|
||||
|
||||
If the `nO` dimension is not set, the Entity Linking component will set it when
|
||||
`begin_training` is called.
|
||||
|
||||
### spacy.EmptyKB.v1 {#EmptyKB}
|
||||
|
||||
A function that creates a default, empty `KnowledgeBase` from a
|
||||
[`Vocab`](/api/vocab) instance.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------------- | ---- | ------------------------------------------------------------------------- |
|
||||
| `entity_vector_length` | int | The length of the vectors encoding each entity in the KB - 64 by default. |
|
||||
|
||||
### spacy.CandidateGenerator.v1 {#CandidateGenerator}
|
||||
|
||||
A function that takes as input a [`KnowledgeBase`](/api/kb) and a
|
||||
[`Span`](/api/span) object denoting a named entity, and returns a list of
|
||||
plausible [`Candidate` objects](/api/kb/#candidate_init).
|
||||
|
||||
The default `CandidateGenerator` simply uses the text of a mention to find its
|
||||
potential aliases in the `KnowledgeBase`. Note that this function is
|
||||
case-dependent.
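A toy, case-dependent candidate lookup in the same spirit (illustrative only; this is not the spaCy `KnowledgeBase` API):

```python
# Surface-form lookup without normalisation: "adams" finds no candidates.
aliases = {
    "Douglas Adams": ["Q42"],
    "Adams": ["Q42", "Q181900"],   # ambiguous surface form
}

def get_candidates(mention: str):
    return aliases.get(mention, [])

print(get_candidates("Adams"))   # ['Q42', 'Q181900']
print(get_candidates("adams"))   # []
```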
|
||||
|
|
245
website/docs/api/attributeruler.md
Normal file
|
@ -0,0 +1,245 @@
|
|||
---
|
||||
title: AttributeRuler
|
||||
tag: class
|
||||
source: spacy/pipeline/attributeruler.py
|
||||
new: 3
|
||||
teaser: 'Pipeline component for rule-based token attribute assignment'
|
||||
api_string_name: attribute_ruler
|
||||
api_trainable: false
|
||||
---
|
||||
|
||||
The attribute ruler lets you set token attributes for tokens identified by
|
||||
[`Matcher` patterns](/usage/rule-based-matching#matcher). The attribute ruler is
|
||||
typically used to handle exceptions for token attributes and to map values
|
||||
between attributes such as mapping fine-grained POS tags to coarse-grained POS
|
||||
tags.
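A minimal usage sketch (the factory name and `add` signature follow the documentation below; treat the exact behaviour as an assumption while the API is still settling):

```python
# Add the attribute ruler to a blank pipeline and set a lemma exception.
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("attribute_ruler")
ruler.add(patterns=[[{"ORTH": "An"}]], attrs={"LEMMA": "an"})

doc = nlp("An example")
print(doc[0].lemma_)  # expected: "an"
```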
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
The default config is defined by the pipeline component factory and describes
|
||||
how the component should be configured. You can override its settings via the
|
||||
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||
[`config.cfg` for training](/usage/training#config).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> config = {
|
||||
> "pattern_dicts": None,
|
||||
> "validate": True,
|
||||
> }
|
||||
> nlp.add_pipe("attribute_ruler", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| --------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------- |
|
||||
| `pattern_dicts` | `Iterable[dict]` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](#add) (`patterns`/`attrs`/`index`) to add as patterns. | `None` |
|
||||
| `validate` | bool | Whether patterns should be validated (passed to the `Matcher`). | `False` |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py
|
||||
```
|
||||
|
||||
## AttributeRuler.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Initialize the attribute ruler. If pattern dicts are supplied here, they need to
|
||||
be a list of dictionaries with `"patterns"`, `"attrs"`, and optional `"index"`
|
||||
keys, e.g.:
|
||||
|
||||
```python
|
||||
pattern_dicts = [
    {"patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}},
    {"patterns": [[{"LOWER": "an"}]], "attrs": {"LEMMA": "a"}},
]
|
||||
```
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> # Construction via add_pipe
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary to pass to the matcher. |
| `name` | str | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. |
|
||||
| _keyword-only_ | | |
|
||||
| `pattern_dicts` | `Iterable[Dict]` | Optional patterns to load in on initialization. Defaults to `None`. |
|
||||
| `validate` | bool | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. |
|
||||
|
||||
## AttributeRuler.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Apply the attribute ruler to a Doc, setting token attributes for tokens matched
|
||||
by the provided patterns.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------ |
|
||||
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
|
||||
| **RETURNS** | `Doc` | The processed `Doc` with modified token attributes. |
|
||||
|
||||
## AttributeRuler.add {#add tag="method"}
|
||||
|
||||
Add patterns to the attribute ruler. The patterns are a list of `Matcher`
|
||||
patterns and the attributes are a dict of attributes to set on the matched
|
||||
token. If the pattern matches a span of more than one token, the `index` can be
|
||||
used to set the attributes for the token at that index in the span. The `index`
|
||||
may be negative to index from the end of the span.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> patterns = [[{"TAG": "VB"}]]
|
||||
> attrs = {"POS": "VERB"}
|
||||
> attribute_ruler.add(patterns=patterns, attrs=attrs)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---------------------- | ----------------------------------------------------------------------------------------------------------------------- |
|
||||
| `patterns` | `Iterable[List[Dict]]` | A list of `Matcher` patterns. |
| `attrs` | dict | The attributes to assign to the target token in the matched span. |
| `index` | int | The index of the token in the matched span to modify. May be negative to index from the end of the span. Defaults to 0. |
|
||||
|
||||
## AttributeRuler.add_patterns {#add_patterns tag="method"}
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> pattern_dicts = [
>     {
>         "patterns": [[{"TAG": "VB"}]],
>         "attrs": {"POS": "VERB"}
>     },
>     {
>         "patterns": [[{"LOWER": "two"}, {"LOWER": "apples"}]],
>         "attrs": {"LEMMA": "apple"},
>         "index": -1
>     },
> ]
|
||||
> attribute_ruler.add_patterns(pattern_dicts)
|
||||
> ```
|
||||
|
||||
Add patterns from a list of pattern dicts with the keys as the arguments to
|
||||
[`AttributeRuler.add`](#add).
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ----------------- | -------------------- |
|
||||
| `pattern_dicts` | `Iterable[Dict]` | The patterns to add. |
|
||||
|
||||
## AttributeRuler.patterns {#patterns tag="property"}
|
||||
|
||||
Get all patterns that have been added to the attribute ruler in the
|
||||
`pattern_dicts` format accepted by
|
||||
[`AttributeRuler.add_patterns`](#add_patterns).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | ------------------------------------------ |
|
||||
| **RETURNS** | `List[dict]` | The patterns added to the attribute ruler. |
|
||||
|
||||
## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"}
|
||||
|
||||
Load attribute ruler patterns from a tag map.
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---- | ------------------------------------------------------------------------------------------ |
|
||||
| `tag_map` | dict | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. |
|
||||
|
||||
## AttributeRuler.load_from_morph_rules {#load_from_morph_rules tag="method"}
|
||||
|
||||
Load attribute ruler patterns from morph rules.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `morph_rules` | dict | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. |
|
||||
|
||||
## AttributeRuler.to_disk {#to_disk tag="method"}
|
||||
|
||||
Serialize the pipe to disk.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> attribute_ruler.to_disk("/path/to/attribute_ruler")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## AttributeRuler.from_disk {#from_disk tag="method"}
|
||||
|
||||
Load the pipe from disk. Modifies the object in place and returns it.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> attribute_ruler.from_disk("/path/to/attribute_ruler")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `AttributeRuler` | The modified `AttributeRuler` object. |
|
||||
|
||||
## AttributeRuler.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> attribute_ruler_bytes = attribute_ruler.to_bytes()
|
||||
> ```
|
||||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `AttributeRuler` object. |
|
||||
|
||||
## AttributeRuler.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> attribute_ruler_bytes = attribute_ruler.to_bytes()
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> attribute_ruler.from_bytes(attribute_ruler_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ---------------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `AttributeRuler` | The `AttributeRuler` object. |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
During serialization, spaCy will export several data fields used to restore
|
||||
different aspects of the object. If needed, you can exclude them from
|
||||
serialization by passing in the string names via the `exclude` argument.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> data = attribute_ruler.to_disk("/path", exclude=["vocab"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------- | -------------------------------------------------------------- |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `patterns` | The Matcher patterns. You usually don't want to exclude this. |
|
||||
| `attrs` | The attributes to set. You usually don't want to exclude this. |
|
||||
| `indices` | The token indices. You usually don't want to exclude this. |
|
|
@ -132,7 +132,7 @@ $ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline]
|
|||
| `--base`, `-b` | option | Optional base config file to auto-fill with defaults. |
|
||||
| `--lang`, `-l` | option | Optional language code to use for blank config. If a `--pipeline` is specified, the components will be added in order. |
|
||||
| `--model`, `-m` | option | Optional base model to copy config from. If a `--pipeline` is specified, only those components will be kept, and all other components not in the model will be added. |
|
||||
| `--pipeline`, `-p` | option | Optional comma-separate pipeline of components to add to blank language or model. |
|
||||
| `--pipeline`, `-p` | option | Optional comma-separated pipeline of components to add to blank language or model. |
|
||||
| **CREATES** | config | Complete and auto-filled config file for training. |
|
||||
|
||||
### init model {#init-model new="2"}
|
||||
|
@ -202,7 +202,7 @@ $ python -m spacy convert [input_file] [output_dir] [--converter]
|
|||
| ID | Description |
|
||||
| ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `auto` | Automatically pick converter based on file extension and file content (default). |
|
||||
| `json` | JSON-formatted training data used in spaCy v2.x and produced by [`docs2json`](/api/top-level#docs_to_json). |
|
||||
| `json` | JSON-formatted training data used in spaCy v2.x. |
|
||||
| `conll` | Universal Dependencies `.conllu` or `.conll` format. |
|
||||
| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
|
||||
| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
|
||||
|
@ -219,23 +219,22 @@ The command will create all objects in the tree and validate them. Note that
|
|||
some config validation errors are blocking and will prevent the rest of the
|
||||
config from being resolved. This means that you may not see all validation
|
||||
errors at once and some issues are only shown once previous errors have been
|
||||
fixed.
|
||||
|
||||
Instead of specifying all required settings in the config file, you can rely on
|
||||
an auto-fill functionality that uses spaCy's built-in defaults. The resulting
|
||||
full config can be written to file and used in downstream training tasks.
|
||||
fixed. To auto-fill a partial config and save the result, you can use the
|
||||
[`init config`](/api/cli#init-config) command.
|
||||
|
||||
```bash
|
||||
$ python -m spacy debug config [config_path] [--code_path] [--output] [--auto_fill] [--diff] [overrides]
|
||||
```
|
||||
|
||||
> #### Example 1
|
||||
> #### Example
|
||||
>
|
||||
> ```bash
|
||||
> $ python -m spacy debug config ./config.cfg
|
||||
> ```
|
||||
|
||||
<Accordion title="Example 1 output" spaced>
|
||||
<Accordion title="Example output" spaced>
|
||||
|
||||
<!-- TODO: update examples with validation error of final config -->
|
||||
|
||||
```
|
||||
✘ Config validation error
|
||||
|
@ -254,30 +253,15 @@ training -> width extra fields not permitted
|
|||
|
||||
</Accordion>
|
||||
|
||||
> #### Example 2
|
||||
>
|
||||
> ```bash
|
||||
> $ python -m spacy debug config ./minimal_config.cfg -F -o ./filled_config.cfg
|
||||
> ```
|
||||
|
||||
<Accordion title="Example 2 output" spaced>
|
||||
|
||||
```
|
||||
✔ Auto-filled config is valid
|
||||
✔ Saved updated config to ./filled_config.cfg
|
||||
```
|
||||
|
||||
</Accordion>
|
||||
|
||||
| Argument | Type | Default | Description |
|
||||
| --------------------- | ---------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `config_path` | positional | - | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `--code_path`, `-c` | option | `None` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||
| `--auto_fill`, `-F` | option | `False` | Whether or not to auto-fill the config with built-in defaults if possible. If `False`, the provided config needs to be complete. |
|
||||
| `--output_path`, `-o` | option | `None` | Output path where the filled config can be stored. Use '-' for standard output. |
|
||||
| `--diff`, `-D` | option | `False` | Show a visual diff if config was auto-filled. |
|
||||
| `--help`, `-h` | flag | `False` | Show help message and available arguments. |
|
||||
| overrides | | `None` | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
|
||||
| Argument | Type | Default | Description |
|
||||
| --------------------- | ---------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `config_path` | positional | - | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `--code_path`, `-c` | option | `None` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||
| `--auto_fill`, `-F` | option | `False` | Whether or not to auto-fill the config with built-in defaults if possible. If `False`, the provided config needs to be complete. |
|
||||
| `--output_path`, `-o` | option | `None` | Output path where the filled config can be stored. Use '-' for standard output. |
|
||||
| `--diff`, `-D` | option | `False` | Show a visual diff if config was auto-filled. |
|
||||
| `--help`, `-h` | flag | `False` | Show help message and available arguments. |
|
||||
| overrides | | `None` | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
|
||||
|
||||
### debug data {#debug-data}
|
||||
|
||||
|
@ -287,21 +271,22 @@ low data labels and more.
|
|||
|
||||
<Infobox title="New in v3.0" variant="warning">
|
||||
|
||||
The `debug-data` command is now available as a subcommand of `spacy debug`. It
|
||||
The `debug data` command is now available as a subcommand of `spacy debug`. It
|
||||
takes the same arguments as `train` and reads settings off the
|
||||
[`config.cfg` file](/usage/training#config).
|
||||
[`config.cfg` file](/usage/training#config) and optional
|
||||
[overrides](/usage/training#config-overrides) on the CLI.
|
||||
|
||||
</Infobox>
|
||||
|
||||
```bash
|
||||
$ python -m spacy debug data [train_path] [dev_path] [config_path] [--code]
|
||||
[--ignore-warnings] [--verbose] [--no-format] [overrides]
|
||||
$ python -m spacy debug data [config_path] [--code] [--ignore-warnings]
|
||||
[--verbose] [--no-format] [overrides]
|
||||
```
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```bash
|
||||
> $ python -m spacy debug data ./train.spacy ./dev.spacy ./config.cfg
|
||||
> $ python -m spacy debug data ./config.cfg
|
||||
> ```
|
||||
|
||||
<Accordion title="Example output" spaced>
|
||||
|
@ -443,17 +428,15 @@ will not be available.
|
|||
|
||||
</Accordion>
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `train_path` | positional | Location of [binary training data](/usage/training#data-format). Can be a file or a directory of files. |
|
||||
| `dev_path` | positional | Location of [binary development data](/usage/training#data-format) for evaluation. Can be a file or a directory of files. |
|
||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||
| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
|
||||
| `--verbose`, `-V` | flag | Print additional information and explanations. |
|
||||
| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
|
||||
| Argument | Type | Description |
|
||||
| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||
| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
|
||||
| `--verbose`, `-V` | flag | Print additional information and explanations. |
|
||||
| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
|
||||
|
||||
<!-- TODO: document debug profile?-->
|
||||
|
||||
|
@ -463,16 +446,20 @@ Debug a Thinc [`Model`](https://thinc.ai/docs/api-model) by running it on a
|
|||
sample text and checking how it updates its internal weights and parameters.
|
||||
|
||||
```bash
|
||||
$ python -m spacy debug model [config_path] [component] [--layers] [-DIM] [-PAR] [-GRAD] [-ATTR] [-P0] [-P1] [-P2] [P3] [--gpu_id]
|
||||
$ python -m spacy debug model [config_path] [component] [--layers] [-DIM]
|
||||
[-PAR] [-GRAD] [-ATTR] [-P0] [-P1] [-P2] [-P3] [--gpu-id]
|
||||
```
|
||||
|
||||
> #### Example 1
|
||||
>
|
||||
> ```bash
|
||||
> $ python -m spacy debug model ./config.cfg tagger -P0
|
||||
> ```
|
||||
<Accordion title="Example outputs" spaced>
|
||||
|
||||
<Accordion title="Example 1 output" spaced>
|
||||
In this example log, we just print the name of each layer after creation of the
|
||||
model ("Step 0"), which helps us to understand the internal structure of the
|
||||
Neural Network, and to focus on specific layers that we want to inspect further
|
||||
(see next example).
|
||||
|
||||
```bash
|
||||
$ python -m spacy debug model ./config.cfg tagger -P0
|
||||
```
|
||||
|
||||
```
|
||||
ℹ Using CPU
|
||||
|
@ -509,20 +496,16 @@ $ python -m spacy debug model [config_path] [component] [--layers] [-DIM] [-PAR]
|
|||
...
|
||||
```
|
||||
|
||||
</Accordion>
|
||||
In this example log, we see how initialization of the model (Step 1) propagates
|
||||
the correct values for the `nI` (input) and `nO` (output) dimensions of the
|
||||
various layers. In the `softmax` layer, this step also defines the `W` matrix as
|
||||
an all-zero matrix determined by the `nO` and `nI` dimensions. After a first
|
||||
training step (Step 2), this matrix has clearly updated its values through the
|
||||
training feedback loop.
|
||||
|
||||
In this example log, we just print the name of each layer after creation of the
|
||||
model ("Step 0"), which helps us to understand the internal structure of the
|
||||
Neural Network, and to focus on specific layers that we want to inspect further
|
||||
(see next example).
|
||||
|
||||
> #### Example 2
|
||||
>
|
||||
> ```bash
|
||||
> $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P2
|
||||
> ```
|
||||
|
||||
<Accordion title="Example 2 output" spaced>
|
||||
```bash
|
||||
$ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P2
|
||||
```
|
||||
|
||||
```
|
||||
ℹ Using CPU
|
||||
|
@ -563,27 +546,20 @@ Neural Network, and to focus on specific layers that we want to inspect further
|
|||
|
||||
</Accordion>
|
||||
|
||||
In this example log, we see how initialization of the model (Step 1) propagates
|
||||
the correct values for the `nI` (input) and `nO` (output) dimensions of the
|
||||
various layers. In the `softmax` layer, this step also defines the `W` matrix as
|
||||
an all-zero matrix determined by the `nO` and `nI` dimensions. After a first
|
||||
training step (Step 2), this matrix has clearly updated its values through the
|
||||
training feedback loop.
|
||||
|
||||
| Argument | Type | Default | Description |
|
||||
| ----------------------- | ---------- | ------- | ---------------------------------------------------------------------------------------------------- |
|
||||
| Argument | Type | Default | Description |
|
||||
| ----------------------- | ---------- | ------- | ----------------------------------------------------------------------------------------------------- |
|
||||
| `config_path` | positional | | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `component` | positional | | Name of the pipeline component of which the model should be analysed. |
|
||||
| `--layers`, `-l` | option | | Comma-separated names of layer IDs to print. |
|
||||
| `--dimensions`, `-DIM` | option | `False` | Show dimensions of each layer. |
|
||||
| `--parameters`, `-PAR` | option | `False` | Show parameters of each layer. |
|
||||
| `--gradients`, `-GRAD` | option | `False` | Show gradients of each layer. |
|
||||
| `--attributes`, `-ATTR` | option | `False` | Show attributes of each layer. |
|
||||
| `--print-step0`, `-P0` | option | `False` | Print model before training. |
|
||||
| `--print-step1`, `-P1` | option | `False` | Print model after initialization. |
|
||||
| `--print-step2`, `-P2` | option | `False` | Print model after training. |
|
||||
| `--print-step3`, `-P3` | option | `False` | Print final predictions. |
|
||||
| `--help`, `-h` | flag | | Show help message and available arguments. |
|
||||
| `component` | positional | | Name of the pipeline component of which the model should be analyzed. |
|
||||
| `--layers`, `-l` | option | | Comma-separated names of layer IDs to print. |
|
||||
| `--dimensions`, `-DIM` | option | `False` | Show dimensions of each layer. |
|
||||
| `--parameters`, `-PAR` | option | `False` | Show parameters of each layer. |
|
||||
| `--gradients`, `-GRAD` | option | `False` | Show gradients of each layer. |
|
||||
| `--attributes`, `-ATTR` | option | `False` | Show attributes of each layer. |
|
||||
| `--print-step0`, `-P0` | option | `False` | Print model before training. |
|
||||
| `--print-step1`, `-P1` | option | `False` | Print model after initialization. |
|
||||
| `--print-step2`, `-P2` | option | `False` | Print model after training. |
|
||||
| `--print-step3`, `-P3` | option | `False` | Print final predictions. |
|
||||
| `--help`, `-h` | flag | | Show help message and available arguments. |
|
||||
|
||||
## Train {#train}
|
||||
|
||||
|
@ -603,37 +579,39 @@ you need to manage complex multi-step training workflows, check out the new
|
|||
The `train` command doesn't take a long list of command-line arguments anymore
|
||||
and instead expects a single [`config.cfg` file](/usage/training#config)
|
||||
containing all settings for the pipeline, training process and hyperparameters.
|
||||
Config values can be [overwritten](/usage/training#config-overrides) on the CLI
|
||||
if needed. For example, `--paths.train ./train.spacy` sets the variable `train`
|
||||
in the section `[paths]`.
|
||||
|
||||
</Infobox>
|
||||
|
||||
```bash
|
||||
$ python -m spacy train [train_path] [dev_path] [config_path] [--output]
|
||||
[--code] [--verbose] [overrides]
|
||||
$ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides]
|
||||
```
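
For example, assuming your config defines `train` and `dev` entries in its `[paths]` section, a typical invocation might look like this (the paths and output directory are placeholders for your own files):

```bash
$ python -m spacy train ./config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
```
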
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `train_path` | positional | Location of training data in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. |
|
||||
| `dev_path` | positional | Location of development data for evaluation in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. |
|
||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `--output`, `-o` | positional | Directory to store model in. Will be created if it doesn't exist. |
|
||||
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||
| `--verbose`, `-V` | flag | Show more detailed messages during training. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
|
||||
| **CREATES** | model | The final model and the best model. |
|
||||
| Argument | Type | Description |
|
||||
| ----------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `--output`, `-o` | positional | Directory to store model in. Will be created if it doesn't exist. |
|
||||
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||
| `--verbose`, `-V` | flag | Show more detailed messages during training. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
|
||||
| **CREATES** | model | The final model and the best model. |
|
||||
|
||||
## Pretrain {#pretrain new="2.1" tag="experimental"}
|
||||
|
||||
<!-- TODO: document new pretrain command and link to new pretraining docs -->
|
||||
|
||||
Pre-train the "token to vector" (`tok2vec`) layer of pipeline components, using
|
||||
an approximate language-modeling objective. Specifically, we load pretrained
|
||||
vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which
|
||||
match the pretrained ones. The weights are saved to a directory after each
|
||||
epoch. You can then pass a path to one of these pretrained weights files to the
|
||||
`spacy train` command. This technique may be especially helpful if you have
|
||||
little labelled data.
|
||||
Pre-train the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
|
||||
components on [raw text](/api/data-formats#pretrain), using an approximate
|
||||
language-modeling objective. Specifically, we load pretrained vectors, and train
|
||||
a component like a CNN, BiLSTM, etc. to predict vectors which match the
|
||||
pretrained ones. The weights are saved to a directory after each epoch. You can
|
||||
then include a **path to one of these pretrained weights files** in your
|
||||
[training config](/usage/training#config) as the `init_tok2vec` setting when you
|
||||
train your model. This technique may be especially helpful if you have little
|
||||
labelled data.
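
For example, the resulting weights path might be wired into the training config roughly like this (a sketch only: the exact filename written by `spacy pretrain` and your `[paths]` layout will differ):

```ini
[paths]
init_tok2vec = "pretrain_output/model99.bin"

[training]
init_tok2vec = ${paths:init_tok2vec}
```
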
|
||||
|
||||
<Infobox title="Changed in v3.0" variant="warning">
|
||||
|
||||
|
@ -650,63 +628,33 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]
|
|||
[--code] [--resume-path] [--epoch-resume] [overrides]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](#pretrain-jsonl) for details. |
|
||||
| `output_dir` | positional | Directory to write models to on each epoch. |
|
||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||
| `--resume-path`, `-r` | option | TODO: |
|
||||
| `--epoch-resume`, `-er` | option | TODO: |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
|
||||
| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
|
||||
|
||||
### JSONL format for raw text {#pretrain-jsonl}
|
||||
|
||||
Raw text can be provided as a `.jsonl` (newline-delimited JSON) file containing
|
||||
one input text per line (roughly paragraph length is good). Optionally, custom
|
||||
tokenization can be provided.
|
||||
|
||||
> #### Tip: Writing JSONL
|
||||
>
|
||||
> Our utility library [`srsly`](https://github.com/explosion/srsly) provides a
|
||||
> handy `write_jsonl` helper that takes a file path and list of dictionaries and
|
||||
> writes out JSONL-formatted data.
|
||||
>
|
||||
> ```python
|
||||
> import srsly
|
||||
> data = [{"text": "Some text"}, {"text": "More..."}]
|
||||
> srsly.write_jsonl("/path/to/text.jsonl", data)
|
||||
> ```
|
||||
|
||||
| Key | Type | Description |
|
||||
| -------- | ---- | ---------------------------------------------------------- |
|
||||
| `text` | str | The raw input text. Is not required if `tokens` available. |
|
||||
| `tokens` | list | Optional tokenization, one string per token. |
|
||||
|
||||
```json
|
||||
### Example
|
||||
{"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
|
||||
{"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
|
||||
{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
|
||||
{"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]}
|
||||
```
|
||||
| Argument | Type | Description |
|
||||
| ----------------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. |
|
||||
| `output_dir` | positional | Directory to write models to on each epoch. |
|
||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||
| `--resume-path`, `-r` | option | TODO: |
|
||||
| `--epoch-resume`, `-er` | option | TODO: |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
|
||||
| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
|
||||
|
||||
## Evaluate {#evaluate new="2"}
|
||||
|
||||
<!-- TODO: document new evaluate command -->
|
||||
|
||||
Evaluate a model's accuracy and speed on JSON-formatted annotated data. Will
|
||||
print the results and optionally export
|
||||
[displaCy visualizations](/usage/visualizers) of a sample set of parses to
|
||||
`.html` files. Visualizations for the dependency parse and NER will be exported
|
||||
as separate files if the respective component is present in the model's
|
||||
pipeline.
|
||||
Evaluate a model. Expects a loadable spaCy model and evaluation data in the
|
||||
[binary `.spacy` format](/api/data-formats#binary-training). The
|
||||
`--gold-preproc` option sets up the evaluation examples with gold-standard
|
||||
sentences and tokens for the predictions. Gold preprocessing helps the
|
||||
annotations align to the tokenization, and may result in sequences of more
|
||||
consistent length. However, it may reduce runtime accuracy due to train/test
|
||||
skew. To render a sample of dependency parses in an HTML file using the
|
||||
[displaCy visualizations](/usage/visualizers), set an output directory as the
|
||||
`--displacy-path` argument.
|
||||
|
||||
```bash
|
||||
$ python -m spacy evaluate [model] [data_path] [--output] [--displacy-path]
|
||||
[--displacy-limit] [--gpu-id] [--gold-preproc]
|
||||
$ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc]
|
||||
[--gpu-id] [--displacy-path] [--displacy-limit]
|
||||
```
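
As a concrete illustration, using the options documented below (the model name and paths are placeholders):

```bash
$ python -m spacy evaluate ./my_model ./dev.spacy --output ./metrics.json --displacy-path ./parses --displacy-limit 10
```
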
|
||||
|
||||
| Argument | Type | Description |
|
||||
|
@ -714,10 +662,10 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--displacy-path]
|
|||
| `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. |
|
||||
| `data_path` | positional | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). |
|
||||
| `--output`, `-o` | option | Output JSON file for metrics. If not set, no metrics will be exported. |
|
||||
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
|
||||
| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. |
|
||||
| `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. |
|
||||
| `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. |
|
||||
| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. |
|
||||
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
|
||||
| **CREATES** | `stdout`, JSON, HTML | Evaluation results and optional metrics and visualizations. |
|
||||
|
||||
## Package {#package}
|
||||
|
|
|
@ -9,7 +9,41 @@ new: 3
|
|||
This class manages annotated corpora and can be used for training and
|
||||
development datasets in the [DocBin](/api/docbin) (`.spacy`) format. To
|
||||
customize the data loading during training, you can register your own
|
||||
[data readers and batchers](/usage/training#custom-code-readers-batchers)
|
||||
[data readers and batchers](/usage/training#custom-code-readers-batchers).
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
`spacy.Corpus.v1` is a registered function that creates a `Corpus` of training
|
||||
or evaluation data. It takes the same arguments as the `Corpus` class and
|
||||
returns a callable that yields [`Example`](/api/example) objects. You can
|
||||
replace it with your own registered function in the
|
||||
[`@readers` registry](/api/top-level#registry) to customize the data loading and
|
||||
streaming.
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [paths]
|
||||
> train = "corpus/train.spacy"
|
||||
>
|
||||
> [training.train_corpus]
|
||||
> @readers = "spacy.Corpus.v1"
|
||||
> path = ${paths:train}
|
||||
> gold_preproc = false
|
||||
> max_length = 0
|
||||
> limit = 0
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ------ | ----------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | `Path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). |
|
||||
| `gold_preproc` | bool | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. |
|
||||
| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. |
|
||||
| `limit` | int | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. |
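
In code, using the class directly might look roughly like the following sketch, which assumes a `train.spacy` file on disk and a blank English pipeline:

```python
import spacy
from spacy.gold import Corpus

nlp = spacy.blank("en")
# Same arguments as the registered spacy.Corpus.v1 function
corpus = Corpus("./corpus/train.spacy", gold_preproc=False, max_length=0, limit=0)
train_examples = list(corpus(nlp))  # yields Example objects
```
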
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/gold/corpus.py
|
||||
```
|
||||
|
||||
## Corpus.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
|
|
|
@ -2,33 +2,222 @@
|
|||
title: Data formats
|
||||
teaser: Details on spaCy's input and output data formats
|
||||
menu:
|
||||
- ['Training Data', 'training']
|
||||
- ['Training Config', 'config']
|
||||
- ['Training Data', 'training']
|
||||
- ['Pretraining Data', 'pretraining']
|
||||
- ['Vocabulary', 'vocab']
|
||||
---
|
||||
|
||||
This section documents input and output formats of data used by spaCy, including
|
||||
training data and lexical vocabulary data. For an overview of label schemes used
|
||||
by the models, see the [models directory](/models). Each model documents the
|
||||
label schemes used in its components, depending on the data it was trained on.
|
||||
the [training config](/usage/training#config), training data and lexical
|
||||
vocabulary data. For an overview of label schemes used by the models, see the
|
||||
[models directory](/models). Each model documents the label schemes used in its
|
||||
components, depending on the data it was trained on.
|
||||
|
||||
## Training config {#config new="3"}
|
||||
|
||||
Config files define the training process and model pipeline and can be passed to
|
||||
[`spacy train`](/api/cli#train). They use
|
||||
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
|
||||
hood. For details on how to use training configs, see the
|
||||
[usage documentation](/usage/training#config).
|
||||
|
||||
<!-- TODO: add details on getting started and init config -->
|
||||
|
||||
> #### What does the @ mean?
|
||||
>
|
||||
> The `@` syntax lets you refer to function names registered in the
|
||||
> [function registry](/api/top-level#registry). For example,
|
||||
> `@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of
|
||||
> the name [spacy.HashEmbedCNN.v1](/api/architectures#HashEmbedCNN) and all
|
||||
> other values defined in its block will be passed into that function as
|
||||
> arguments. Those arguments depend on the registered function. See the usage
|
||||
> guide on [registered functions](/usage/training#config-functions) for details.
|
||||
|
||||
```ini
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/default_config.cfg
|
||||
```
|
||||
|
||||
<Infobox title="Notes on data validation" emoji="💡">
|
||||
|
||||
Under the hood, spaCy's configs are powered by our machine learning library
|
||||
[Thinc's config system](https://thinc.ai/docs/usage-config), which uses
|
||||
[`pydantic`](https://github.com/samuelcolvin/pydantic/) for data validation
|
||||
based on type hints. See
|
||||
[`spacy/schemas.py`](https://github.com/explosion/spaCy/blob/develop/spacy/schemas.py)
|
||||
for the schemas used to validate the default config. Arguments of registered
|
||||
functions are validated against their type annotations, if available. To debug
|
||||
your config and check that it's valid, you can run the
|
||||
[`spacy debug config`](/api/cli#debug-config) command.
|
||||
|
||||
</Infobox>
|
||||
|
||||
<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
|
||||
|
||||
### nlp {#config-nlp tag="section"}
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```ini
|
||||
> [nlp]
|
||||
> lang = "en"
|
||||
> pipeline = ["tagger", "parser", "ner"]
|
||||
> load_vocab_data = true
|
||||
> before_creation = null
|
||||
> after_creation = null
|
||||
> after_pipeline_creation = null
|
||||
>
|
||||
> [nlp.tokenizer]
|
||||
> @tokenizers = "spacy.Tokenizer.v1"
|
||||
> ```
|
||||
|
||||
Defines the `nlp` object, its tokenizer and
|
||||
[processing pipeline](/usage/processing-pipelines) component names.
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| ------------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------- |
|
||||
| `lang` | str | The language code to use. | `null` |
|
||||
| `pipeline` | `List[str]` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). | `[]` |
|
||||
| `load_vocab_data` | bool | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. | `true` |
|
||||
| `before_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. | `null` |
|
||||
| `after_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. | `null` |
|
||||
| `after_pipeline_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. | `null` |
|
||||
| `tokenizer` | callable | The tokenizer to use. | [`Tokenizer`](/api/tokenizer) |
|
||||
|
||||
### components {#config-components tag="section"}
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```ini
|
||||
> [components.textcat]
|
||||
> factory = "textcat"
|
||||
> labels = ["POSITIVE", "NEGATIVE"]
|
||||
>
|
||||
> [components.textcat.model]
|
||||
> @architectures = "spacy.TextCatBOW.v1"
|
||||
> exclusive_classes = false
|
||||
> ngram_size = 1
|
||||
> no_output_layer = false
|
||||
> ```
|
||||
|
||||
This section includes definitions of the
|
||||
[pipeline components](/usage/processing-pipelines) and their models, if
|
||||
available. Components in this section can be referenced in the `pipeline` of the
|
||||
`[nlp]` block. Component blocks need to specify either a `factory` (named
|
||||
function to use to create the component) or a `source` (name or path of a pretrained
|
||||
model to copy components from). See the docs on
|
||||
[defining pipeline components](/usage/training#config-components) for details.
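
The example above uses a `factory`. A sourced component, by contrast, might be sketched like this (the model name is a placeholder for a pipeline you have installed or saved to disk):

```ini
[components.ner]
source = "en_core_web_sm"
```
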
|
||||
|
||||
### paths, system {#config-variables tag="variables"}
|
||||
|
||||
These sections define variables that can be referenced across the other sections.
|
||||
For example, `${paths:train}` uses the value of `train` defined in
|
||||
the block `[paths]`. If your config includes custom registered functions that
|
||||
need paths, you can define them here. All config values can also be
|
||||
[overwritten](/usage/training#config-overrides) on the CLI when you run
|
||||
[`spacy train`](/api/cli#train), which is especially relevant for data paths
|
||||
that you don't want to hard-code in your config file.
|
||||
|
||||
```bash
|
||||
$ python -m spacy train ./config.cfg --paths.train ./corpus/train.spacy
|
||||
```
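
Inside the config itself, defining and reusing such a variable looks like this, mirroring the [`Corpus`](/api/corpus) example in these docs:

```ini
[paths]
train = "corpus/train.spacy"

[training.train_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
```
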
|
||||
|
||||
### training {#config-training tag="section"}
|
||||
|
||||
This section defines settings and controls for the training and evaluation
|
||||
process that are used when you run [`spacy train`](/api/cli#train).
|
||||
|
||||
<!-- TODO: complete -->
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| --------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------- |
|
||||
| `seed` | int | The random seed. | `${system:seed}` |
|
||||
| `dropout` | float | The dropout rate. | `0.1` |
|
||||
| `accumulate_gradient` | int | Whether to divide the batch up into substeps. | `1` |
|
||||
| `init_tok2vec` | str | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). | `${paths:init_tok2vec}` |
|
||||
| `raw_text` | str | | `${paths:raw}` |
|
||||
| `vectors` | str | | `null` |
|
||||
| `patience` | int | How many steps to continue without improvement in evaluation score. | `1600` |
|
||||
| `max_epochs` | int | Maximum number of epochs to train for. | `0` |
|
||||
| `max_steps` | int | Maximum number of update steps to train for. | `20000` |
|
||||
| `eval_frequency` | int | How often to evaluate during training (steps). | `200` |
|
||||
| `score_weights` | `Dict[str, float]` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. | `{}` |
|
||||
| `frozen_components` | `List[str]` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. | `[]` |
|
||||
| `train_corpus` | callable | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. | [`Corpus`](/api/corpus) |
|
||||
| `dev_corpus` | callable | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. | [`Corpus`](/api/corpus) |
|
||||
| `batcher` | callable | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. | [`batch_by_words`](/api/top-level#batch_by_words) |
|
||||
| `optimizer` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) |
|
||||
|
||||
### pretraining {#config-pretraining tag="section,optional"}
|
||||
|
||||
This section is optional and defines settings and controls for
|
||||
[language model pretraining](/usage/training#pretraining). It's used when you
|
||||
run [`spacy pretrain`](/api/cli#pretrain).
|
||||
|
||||
<!-- TODO: complete -->
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| ---------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------- | --------------------------------------------------- |
|
||||
| `max_epochs` | int | Maximum number of epochs. | `1000` |
|
||||
| `min_length` | int | Minimum length of examples. | `5` |
|
||||
| `max_length` | int | Maximum length of examples. | `500` |
|
||||
| `dropout` | float | The dropout rate. | `0.2` |
|
||||
| `n_save_every` | int | Saving frequency. | `null` |
|
||||
| `batch_size` | int / `Sequence[int]` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). | `3000` |
|
||||
| `seed` | int | The random seed. | `${system.seed}` |
|
||||
| `use_pytorch_for_gpu_memory` | bool | Allocate memory via PyTorch. | `${system:use_pytorch_for_gpu_memory}` |
|
||||
| `tok2vec_model` | str | tok2vec model section in the config. | `"components.tok2vec.model"` |
|
||||
| `objective` | dict | The pretraining objective. | `{"type": "characters", "n_characters": 4}` |
|
||||
| `optimizer` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) |
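
Similarly, a partial `[pretraining]` block using the documented defaults could look roughly like this (treating `objective` as a sub-block is an assumption of this sketch):

```ini
[pretraining]
max_epochs = 1000
min_length = 5
max_length = 500
dropout = 0.2
batch_size = 3000
tok2vec_model = "components.tok2vec.model"

[pretraining.objective]
type = "characters"
n_characters = 4
```
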
|
||||
|
||||
## Training data {#training}
|
||||
|
||||
### Binary training format {#binary-training new="3"}
|
||||
|
||||
<!-- TODO: document DocBin format -->
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.tokens import DocBin
|
||||
> from spacy.gold import Corpus
|
||||
>
|
||||
> doc_bin = DocBin(docs=docs)
|
||||
> doc_bin.to_disk("./data.spacy")
|
||||
> reader = Corpus("./data.spacy")
|
||||
> ```
|
||||
|
||||
### JSON input format for training {#json-input}
|
||||
The main data format used in spaCy v3.0 is a **binary format** created by
|
||||
serializing a [`DocBin`](/api/docbin) object, which represents a collection of
|
||||
`Doc` objects. This means that you can train spaCy models using the same format
|
||||
it outputs: annotated `Doc` objects. The binary format is extremely **efficient
|
||||
in storage**, especially when packing multiple documents together.
|
||||
|
||||
spaCy takes training data in JSON format. The built-in
|
||||
[`convert`](/api/cli#convert) command helps you convert the `.conllu` format
|
||||
used by the
|
||||
[Universal Dependencies corpora](https://github.com/UniversalDependencies) to
|
||||
spaCy's training format. To convert one or more existing `Doc` objects to
|
||||
spaCy's JSON format, you can use the
|
||||
[`gold.docs_to_json`](/api/top-level#docs_to_json) helper.
|
||||
Typically, the extension for these binary files is `.spacy`, and they are used
|
||||
as input format for specifying a [training corpus](/api/corpus) and for spaCy's
|
||||
CLI [`train`](/api/cli#train) command. The built-in
|
||||
[`convert`](/api/cli#convert) command helps you convert spaCy's previous
|
||||
[JSON format](#json-input) to the new binary format. It also supports
|
||||
conversion of the `.conllu` format used by the
|
||||
[Universal Dependencies corpora](https://github.com/UniversalDependencies).
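
As a minimal sketch, creating such a file from annotated `Doc` objects might look like this (the text and entity span are purely illustrative):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc = nlp("Apple is looking at buying U.K. startup")
doc.ents = [doc.char_span(0, 5, label="ORG")]  # annotate "Apple" as an entity

doc_bin = DocBin(docs=[doc])
doc_bin.to_disk("./train.spacy")  # pass this path to the training config
```
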
|
||||
|
||||
> #### Annotating entities {#biluo}
|
||||
### JSON training format {#json-input tag="deprecated"}
|
||||
|
||||
<Infobox variant="warning" title="Changed in v3.0">
|
||||
|
||||
As of v3.0, the JSON input format is deprecated and is replaced by the
|
||||
[binary format](#binary-training). Instead of converting [`Doc`](/api/doc)
|
||||
objects to JSON, you can now serialize them directly using the
|
||||
[`DocBin`](/api/docbin) container and then use them as input data.
|
||||
|
||||
[`spacy convert`](/api/cli) lets you convert your JSON data to the new `.spacy`
|
||||
format:
|
||||
|
||||
```bash
|
||||
$ python -m spacy convert ./data.json ./output
|
||||
```
|
||||
|
||||
</Infobox>
|
||||
|
||||
> #### Annotating entities
|
||||
>
|
||||
> Named entities are provided in the
|
||||
> [BILUO](/usage/linguistic-features#accessing-ner) notation. Tokens outside an
|
||||
|
@ -68,152 +257,154 @@ spaCy's JSON format, you can use the
|
|||
}]
|
||||
```
|
||||
|
||||
<Accordion title="Sample JSON data" spaced>
|
||||
|
||||
Here's an example of dependencies, part-of-speech tags and named entities, taken
|
||||
from the English Wall Street Journal portion of the Penn Treebank:
|
||||
|
||||
```json
|
||||
https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json
|
||||
https://github.com/explosion/spaCy/blob/v2.3.x/examples/training/training-data.json
|
||||
```
|
||||
|
||||
### Annotations in dictionary format {#dict-input}
|
||||
</Accordion>
|
||||
|
||||
To create [`Example`](/api/example) objects, you can create a dictionary of the
|
||||
gold-standard annotations `gold_dict`, and then call
|
||||
### Annotation format for creating training examples {#dict-input}
|
||||
|
||||
```python
|
||||
example = Example.from_dict(doc, gold_dict)
|
||||
```
|
||||
An [`Example`](/api/example) object holds the information for one training
|
||||
instance. It stores two [`Doc`](/api/doc) objects: one for holding the
|
||||
gold-standard reference data, and one for holding the predictions of the
|
||||
pipeline. Examples can be created using the
|
||||
[`Example.from_dict`](/api/example#from_dict) method with a reference `Doc` and
|
||||
a dictionary of gold-standard annotations.
|
||||
|
||||
There are currently two formats supported for this dictionary of annotations:
|
||||
one with a simple, flat structure of keywords, and one with a more hierarchical
|
||||
structure.
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> example = Example.from_dict(doc, gold_dict)
|
||||
> ```
|
||||
|
||||
#### Flat structure {#dict-flat}
|
||||
<Infobox title="Important note" variant="warning">
|
||||
|
||||
Here is the full overview of potential entries in a flat dictionary of
|
||||
annotations. You need to only specify those keys corresponding to the task you
|
||||
want to train.
|
||||
`Example` objects are used as part of the
|
||||
[internal training API](/usage/training#api) and they're expected when you call
|
||||
[`nlp.update`](/api/language#update). However, for most use cases, you
|
||||
**shouldn't** have to write your own training scripts. It's recommended to train
|
||||
your models via the [`spacy train`](/api/cli#train) command with a config file
|
||||
to keep track of your settings and hyperparameters, and your own
|
||||
[registered functions](/usage/training/#custom-code) to customize the setup.
|
||||
|
||||
```python
|
||||
### Flat dictionary
|
||||
{
|
||||
"text": string, # Raw text.
|
||||
"words": List[string], # List of gold tokens.
|
||||
"lemmas": List[string], # List of lemmas.
|
||||
"spaces": List[bool], # List of boolean values indicating whether the corresponding tokens is followed by a space or not.
|
||||
"tags": List[string], # List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging).
|
||||
"pos": List[string], # List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging).
|
||||
"morphs": List[string], # List of [morphological features](/usage/linguistic-features#rule-based-morphology).
|
||||
"sent_starts": List[bool], # List of boolean values indicating whether each token is the first of a sentence or not.
|
||||
"deps": List[string], # List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head.
|
||||
"heads": List[int], # List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text.
|
||||
"entities": List[string], # Option 1: List of [BILUO tags](#biluo) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens.
|
||||
"entities": List[(int, int, string)], # Option 2: List of `"(start, end, label)"` tuples defining all entities in.
|
||||
"cats": Dict[str, float], # Dictionary of `label:value` pairs indicating how relevant a certain [category](/api/textcategorizer) is for the text.
|
||||
"links": Dict[(int, int), Dict], # Dictionary of `offset:dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The charachter offsets are linked to a dictionary of relevant knowledge base IDs.
|
||||
}
|
||||
```
|
||||
</Infobox>
|
||||
|
||||
There are a few caveats to take into account:
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> {
|
||||
> "text": str,
|
||||
> "words": List[str],
|
||||
> "lemmas": List[str],
|
||||
> "spaces": List[bool],
|
||||
> "tags": List[str],
|
||||
> "pos": List[str],
|
||||
> "morphs": List[str],
|
||||
> "sent_starts": List[bool],
|
||||
> "deps": List[string],
|
||||
> "heads": List[int],
|
||||
> "entities": List[str],
|
||||
> "entities": List[(int, int, str)],
|
||||
> "cats": Dict[str, float],
|
||||
> "links": Dict[(int, int), dict],
|
||||
> }
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `text` | str | Raw text. |
|
||||
| `words` | `List[str]` | List of gold-standard tokens. |
|
||||
| `lemmas` | `List[str]` | List of lemmas. |
|
||||
| `spaces` | `List[bool]` | List of boolean values indicating whether the corresponding tokens is followed by a space or not. |
|
||||
| `tags` | `List[str]` | List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging). |
|
||||
| `pos` | `List[str]` | List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging). |
|
||||
| `morphs` | `List[str]` | List of [morphological features](/usage/linguistic-features#rule-based-morphology). |
|
||||
| `sent_starts` | `List[bool]` | List of boolean values indicating whether each token is the first of a sentence or not. |
|
||||
| `deps` | `List[str]` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. |
|
||||
| `heads` | `List[int]` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. |
|
||||
| `entities` | `List[str]` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. |
|
||||
| `entities` | `List[Tuple[int, int, str]]` | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. |
|
||||
| `cats` | `Dict[str, float]` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. |
|
||||
| `links` | `Dict[(int, int), Dict]` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. |
|
||||
|
||||
<Infobox title="Notes and caveats">
|
||||
|
||||
- Multiple formats are possible for the "entities" entry, but you have to pick
|
||||
one.
|
||||
- Any values for sentence starts will be ignored if there are annotations for
|
||||
dependency relations.
|
||||
- If the dictionary contains values for "text" and "words", but not "spaces",
|
||||
the latter are inferred automatically. If "words" is not provided either, the
|
||||
values are inferred from the `doc` argument.
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# Training data for a part-of-speech tagger
|
||||
doc = Doc(vocab, words=["I", "like", "stuff"])
|
||||
example = Example.from_dict(doc, {"tags": ["NOUN", "VERB", "NOUN"]})
|
||||
|
||||
# Training data for an entity recognizer (option 1)
|
||||
doc = nlp("Laura flew to Silicon Valley.")
|
||||
biluo_tags = ["U-PERS", "O", "O", "B-LOC", "L-LOC"]
|
||||
example = Example.from_dict(doc, {"entities": biluo_tags})
|
||||
|
||||
# Training data for an entity recognizer (option 2)
|
||||
doc = nlp("Laura flew to Silicon Valley.")
|
||||
entity_tuples = [
|
||||
(0, 5, "PERSON"),
|
||||
(14, 28, "LOC"),
|
||||
]
|
||||
example = Example.from_dict(doc, {"entities": entity_tuples})
|
||||
|
||||
# Training data for text categorization
|
||||
doc = nlp("I'm pretty happy about that!")
|
||||
example = Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})
|
||||
|
||||
# Training data for an Entity Linking component
|
||||
doc = nlp("Russ Cochran his reprints include EC Comics.")
|
||||
example = Example.from_dict(doc, {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}})
|
||||
```
|
||||
|
||||
#### Hierarchical structure {#dict-hierarch}
|
||||
|
||||
Internally, a more hierarchical dictionary structure is used to store
|
||||
gold-standard annotations. Its format is similar to the structure described in
|
||||
the previous section, but there are two main sections `token_annotation` and
|
||||
`doc_annotation`, and the keys for token annotations should be uppercase
|
||||
[`Token` attributes](/api/token#attributes) such as "ORTH" and "TAG".
|
||||
|
||||
```python
|
||||
### Hierarchical dictionary
|
||||
{
|
||||
"text": string, # Raw text.
|
||||
"token_annotation": {
|
||||
"ORTH": List[string], # List of gold tokens.
|
||||
"LEMMA": List[string], # List of lemmas.
|
||||
"SPACY": List[bool], # List of boolean values indicating whether the corresponding tokens is followed by a space or not.
|
||||
"TAG": List[string], # List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging).
|
||||
"POS": List[string], # List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging).
|
||||
"MORPH": List[string], # List of [morphological features](/usage/linguistic-features#rule-based-morphology).
|
||||
"SENT_START": List[bool], # List of boolean values indicating whether each token is the first of a sentence or not.
|
||||
"DEP": List[string], # List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head.
|
||||
"HEAD": List[int], # List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text.
|
||||
},
|
||||
"doc_annotation": {
|
||||
"entities": List[(int, int, string)], # List of [BILUO tags](#biluo) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens.
|
||||
"cats": Dict[str, float], # Dictionary of `label:value` pairs indicating how relevant a certain [category](/api/textcategorizer) is for the text.
|
||||
"links": Dict[(int, int), Dict], # Dictionary of `offset:dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The charachter offsets are linked to a dictionary of relevant knowledge base IDs.
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
There are a few caveats to take into account:
|
||||
|
||||
- Any values for sentence starts will be ignored if there are annotations for
|
||||
dependency relations.
|
||||
- If the dictionary contains values for "text" and "ORTH", but not "SPACY", the
|
||||
latter are inferred automatically. If "ORTH" is not provided either, the
|
||||
values are inferred from the `doc` argument.
|
||||
|
||||
## Training config {#config new="3"}
|
||||
|
||||
Config files define the training process and model pipeline and can be passed to
|
||||
[`spacy train`](/api/cli#train). They use
|
||||
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
|
||||
hood. For details on how to use training configs, see the
|
||||
[usage documentation](/usage/training#config).
|
||||
|
||||
<Infobox variant="warning">
|
||||
|
||||
The `@` syntax lets you refer to function names registered in the
|
||||
[function registry](/api/top-level#registry). For example,
|
||||
`@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of
|
||||
the name `"spacy.HashEmbedCNN.v1"` and all other values defined in its block
|
||||
will be passed into that function as arguments. Those arguments depend on the
|
||||
registered function. See the [model architectures](/api/architectures) docs for
|
||||
API details.
|
||||
- If the dictionary contains values for `"text"` and `"words"`, but not
|
||||
`"spaces"`, the latter are inferred automatically. If "words" is not provided
|
||||
either, the values are inferred from the `Doc` argument.
|
||||
|
||||
</Infobox>
|
||||
|
||||
<!-- TODO: we need to come up with a good way to present the sections and their expected values visually? -->
|
||||
<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
|
||||
```python
|
||||
### Examples
|
||||
# Training data for a part-of-speech tagger
|
||||
doc = Doc(vocab, words=["I", "like", "stuff"])
|
||||
gold_dict = {"tags": ["NOUN", "VERB", "NOUN"]}
|
||||
example = Example.from_dict(doc, gold_dict)
|
||||
|
||||
# Training data for an entity recognizer (option 1)
|
||||
doc = nlp("Laura flew to Silicon Valley.")
|
||||
gold_dict = {"entities": ["U-PERS", "O", "O", "B-LOC", "L-LOC"]}
|
||||
example = Example.from_dict(doc, gold_dict)
|
||||
|
||||
# Training data for an entity recognizer (option 2)
|
||||
doc = nlp("Laura flew to Silicon Valley.")
|
||||
gold_dict = {"entities": [(0, 5, "PERSON"), (14, 28, "LOC")]}
|
||||
example = Example.from_dict(doc, gold_dict)
|
||||
|
||||
# Training data for text categorization
|
||||
doc = nlp("I'm pretty happy about that!")
|
||||
gold_dict = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
|
||||
example = Example.from_dict(doc, gold_dict)
|
||||
|
||||
# Training data for an Entity Linking component
|
||||
doc = nlp("Russ Cochran his reprints include EC Comics.")
|
||||
gold_dict = {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}
|
||||
example = Example.from_dict(doc, gold_dict)
|
||||
```
|
||||
|
||||
## Pretraining data {#pretraining}
|
||||
|
||||
The [`spacy pretrain`](/api/cli#pretrain) command lets you pretrain the tok2vec
|
||||
layer of pipeline components from raw text. Raw text can be provided as a
|
||||
`.jsonl` (newline-delimited JSON) file containing one input text per line
|
||||
(roughly paragraph length is good). Optionally, custom tokenization can be
|
||||
provided.
|
||||
|
||||
> #### Tip: Writing JSONL
|
||||
>
|
||||
> Our utility library [`srsly`](https://github.com/explosion/srsly) provides a
|
||||
> handy `write_jsonl` helper that takes a file path and list of dictionaries and
|
||||
> writes out JSONL-formatted data.
|
||||
>
|
||||
> ```python
|
||||
> import srsly
|
||||
> data = [{"text": "Some text"}, {"text": "More..."}]
|
||||
> srsly.write_jsonl("/path/to/text.jsonl", data)
|
||||
> ```
|
||||
|
||||
| Key | Type | Description |
|
||||
| -------- | ---- | ---------------------------------------------------------- |
|
||||
| `text` | str | The raw input text. Not required if `tokens` is available. |
|
||||
| `tokens` | list | Optional tokenization, one string per token. |
|
||||
|
||||
```json
|
||||
### Example
|
||||
{"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
|
||||
{"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
|
||||
{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
|
||||
{"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]}
|
||||
```
|
||||
|
||||
## Lexical data for vocabulary {#vocab-jsonl new="2"}
|
||||
|
||||
|
|
|
@ -265,37 +265,6 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
|
|||
| ----------- | -------------------------------------- | ----------------------------------------------- |
|
||||
| **RETURNS** | `numpy.ndarray[ndim=2, dtype="int32"]` | The lowest common ancestor matrix of the `Doc`. |
|
||||
|
||||
## Doc.to_json {#to_json tag="method" new="2.1"}
|
||||
|
||||
Convert a Doc to JSON. The format it produces will be the new format for the
|
||||
[`spacy train`](/api/cli#train) command (not implemented yet). If custom
|
||||
underscore attributes are specified, their values need to be JSON-serializable.
|
||||
They'll be added to an `"_"` key in the data, e.g. `"_": {"foo": "bar"}`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> doc = nlp("Hello")
|
||||
> json_doc = doc.to_json()
|
||||
> ```
|
||||
>
|
||||
> #### Result
|
||||
>
|
||||
> ```python
|
||||
> {
|
||||
> "text": "Hello",
|
||||
> "ents": [],
|
||||
> "sents": [{"start": 0, "end": 5}],
|
||||
> "tokens": [{"id": 0, "start": 0, "end": 5, "pos": "INTJ", "tag": "UH", "dep": "ROOT", "head": 0}
|
||||
> ]
|
||||
> }
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ---- | ------------------------------------------------------------------------------ |
|
||||
| `underscore` | list | Optional list of string names of custom JSON-serializable `doc._.` attributes. |
|
||||
| **RETURNS** | dict | The JSON-formatted data. |
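
For example, a custom extension attribute can be included in the output via the
`underscore` argument. This is a hedged sketch: `review_id` is a hypothetical
`Doc._.` attribute used only for illustration.

```python
from spacy.tokens import Doc

# Register a hypothetical custom attribute and include it in the JSON output.
Doc.set_extension("review_id", default=None)
doc = nlp("Hello")
doc._.review_id = "r-123"
json_doc = doc.to_json(underscore=["review_id"])
# The custom attribute ends up under the "_" key, e.g. {"_": {"review_id": "r-123"}}
assert json_doc["_"]["review_id"] == "r-123"
```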
|
||||
|
||||
## Doc.to_array {#to_array tag="method"}
|
||||
|
||||
Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence
|
||||
|
|
|
@ -125,7 +125,8 @@ Serialize the `DocBin`'s annotations to a bytestring.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> doc_bin = DocBin(attrs=["DEP", "HEAD"])
|
||||
> docs = [nlp("Hello world!")]
|
||||
> doc_bin = DocBin(docs=docs)
|
||||
> doc_bin_bytes = doc_bin.to_bytes()
|
||||
> ```
|
||||
|
||||
|
@ -148,3 +149,36 @@ Deserialize the `DocBin`'s annotations from a bytestring.
|
|||
| ------------ | -------- | ---------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| **RETURNS** | `DocBin` | The loaded `DocBin`. |
|
||||
|
||||
## DocBin.to_disk {#to_disk tag="method" new="3"}
|
||||
|
||||
Save the serialized `DocBin` to a file. Typically uses the `.spacy` extension
|
||||
and the result can be used as the input data for
|
||||
[`spacy train`](/api/cli#train).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> docs = [nlp("Hello world!")]
|
||||
> doc_bin = DocBin(docs=docs)
|
||||
> doc_bin.to_disk("./data.spacy")
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------- | ------------ | ----------------------------------------------------- |
|
||||
| `path` | str / `Path` | The file path, typically with the `.spacy` extension. |
|
||||
|
||||
## DocBin.from_disk {#from_disk tag="method" new="3"}
|
||||
|
||||
Load a serialized `DocBin` from a file. Typically uses the `.spacy` extension.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> doc_bin = DocBin().from_disk("./data.spacy")
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------- | ------------ | ----------------------------------------------------- |
|
||||
| `path` | str / `Path` | The file path, typically with the `.spacy` extension. |
|
||||
| **RETURNS** | `DocBin` | The loaded `DocBin`. |
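
Putting the two methods together, a typical round trip looks roughly like the
following sketch (the file name and texts are illustrative only):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
# Serialize a collection of Docs to the binary .spacy format ...
doc_bin = DocBin(docs=[nlp("Hello world!"), nlp("Another doc.")])
doc_bin.to_disk("./train.spacy")
# ... and load them back, restoring the Docs with a shared vocab.
loaded = DocBin().from_disk("./train.spacy")
docs = list(loaded.get_docs(nlp.vocab))
assert len(docs) == 2
```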
|
||||
|
|
|
@ -9,6 +9,13 @@ api_string_name: entity_linker
|
|||
api_trainable: true
|
||||
---
|
||||
|
||||
An `EntityLinker` component disambiguates textual mentions (tagged as named
entities) to unique identifiers, grounding the named entities into the "real
world". It requires a `KnowledgeBase`, as well as a function that generates
plausible candidates from that `KnowledgeBase` for a given textual mention, and
an ML model that picks the right candidate based on the local context of the
mention.
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
The default config is defined by the pipeline component factory and describes
|
||||
|
@ -23,22 +30,24 @@ architectures and their arguments and hyperparameters.
|
|||
> ```python
|
||||
> from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL
|
||||
> config = {
|
||||
> "kb": None,
|
||||
> "labels_discard": [],
|
||||
> "incl_prior": True,
|
||||
> "incl_context": True,
|
||||
> "model": DEFAULT_NEL_MODEL,
|
||||
> "kb_loader": {'@assets': 'spacy.EmptyKB.v1', 'entity_vector_length': 64},
|
||||
> "get_candidates": {'@assets': 'spacy.CandidateGenerator.v1'},
|
||||
> }
|
||||
> nlp.add_pipe("entity_linker", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ---------------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- |
|
||||
| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb) holding all entities and their aliases. | `None` |
|
||||
| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | `[]` |
|
||||
| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | `True` |
|
||||
| `incl_context` | bool | Whether or not to include the local context in the model. | `True` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) |
|
||||
| Setting | Type | Description | Default |
|
||||
| ---------------- | -------------------------------------------------------- | --------------------------------------------------------------------------- | ------------------------------------------------------ |
|
||||
| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | `[]` |
|
||||
| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | `True` |
|
||||
| `incl_context` | bool | Whether or not to include the local context in the model. | `True` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) |
|
||||
| `kb_loader` | `Callable[[Vocab], KnowledgeBase]` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. | An empty KnowledgeBase with `entity_vector_length` 64. |
|
||||
| `get_candidates` | `Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]` | Function that generates plausible candidates for a given `Span` object. | Built-in dictionary-lookup function. |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
|
||||
|
@ -53,7 +62,11 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
|
|||
> entity_linker = nlp.add_pipe("entity_linker")
|
||||
>
|
||||
> # Construction via add_pipe with custom model
|
||||
> config = {"model": {"@architectures": "my_el"}}
|
||||
> config = {"model": {"@architectures": "my_el.v1"}}
|
||||
> entity_linker = nlp.add_pipe("entity_linker", config=config)
|
||||
>
|
||||
> # Construction via add_pipe with custom KB and candidate generation
|
||||
> config = {"kb_loader": {"@assets": "my_kb.v1"}, "get_candidates": {"@assets": "my_candidates.v1"},}
|
||||
> entity_linker = nlp.add_pipe("entity_linker", config=config)
|
||||
>
|
||||
> # Construction from class
|
||||
|
@ -65,18 +78,20 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
<!-- TODO: finish API docs -->
Note that both the internal `KnowledgeBase` and the candidate generator can be
customized by providing custom registered functions (see the sketch after the
table below).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------- | --------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| _keyword-only_ | | |
|
||||
| `kb`             | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb) holding all entities and their aliases.                       |
|
||||
| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. |
|
||||
| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. |
|
||||
| `incl_context` | bool | Whether or not to include the local context in the model. |
|
||||
| Name | Type | Description |
|
||||
| ---------------- | -------------------------------------------------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| _keyword-only_ | | |
|
||||
| `kb_loader` | `Callable[[Vocab], KnowledgeBase]` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. |
|
||||
| `get_candidates` | `Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]` | Function that generates plausible candidates for a given `Span` object. |
|
||||
| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. |
|
||||
| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. |
|
||||
| `incl_context` | bool | Whether or not to include the local context in the model. |
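
As a rough sketch of such a customization (assuming the `assets` registry used
in the config examples above is exposed as `spacy.registry.assets`; the name
`my_kb.v1`, the entity ID, frequency and vectors are illustrative only), a
custom `kb_loader` could be registered and plugged in like this:

```python
import spacy
from spacy.kb import KnowledgeBase

@spacy.registry.assets("my_kb.v1")
def create_kb_loader():
    def load_kb(vocab):
        # Build a tiny in-memory KB; a real loader would typically read the
        # entities, frequencies and aliases from disk instead.
        kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
        kb.add_entity(entity="Q2146908", freq=12, entity_vector=[0.0] * 64)
        kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[1.0])
        return kb
    return load_kb

config = {"kb_loader": {"@assets": "my_kb.v1"}}
entity_linker = nlp.add_pipe("entity_linker", config=config)
```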
|
||||
|
||||
## EntityLinker.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
|
|
@ -27,7 +27,7 @@ how the component should be configured. You can override its settings via the
|
|||
> ```python
|
||||
> config = {
|
||||
> "phrase_matcher_attr": None,
|
||||
> "validation": True,
|
||||
> "validate": True,
|
||||
> "overwrite_ents": False,
|
||||
> "ent_id_sep": "||",
|
||||
> }
|
||||
|
@ -37,7 +37,7 @@ how the component should be configured. You can override its settings via the
|
|||
| Setting | Type | Description | Default |
|
||||
| --------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
|
||||
| `phrase_matcher_attr` | str | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. | `None` |
|
||||
| `validation` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. | `False` |
|
||||
| `validate` | bool | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). | `False` |
|
||||
| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. | `False` |
|
||||
| `ent_id_sep` | str | Separator used internally for entity IDs. | `"||"` |
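
As a small usage sketch, these settings can be overridden when adding the
component (the values shown are illustrative only):

```python
# Validate patterns and let matches overwrite existing entities.
config = {"validate": True, "overwrite_ents": True}
ruler = nlp.add_pipe("entity_ruler", config=config)
```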
|
||||
|
||||
|
|
|
@ -244,8 +244,7 @@ accuracy of predicted entities against the original gold-standard annotation.
|
|||
|
||||
## Example.to_dict {#to_dict tag="method"}
|
||||
|
||||
Return a
|
||||
[hierarchical dictionary representation](/api/data-formats#dict-hierarch) of the
|
||||
Return a [dictionary representation](/api/data-formats#dict-input) of the
|
||||
reference annotation contained in this `Example`.
|
||||
|
||||
> #### Example
|
||||
|
@ -256,7 +255,7 @@ reference annotation contained in this `Example`.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | ------------------------------------------------------ |
|
||||
| **RETURNS** | `Dict[str, obj]` | Dictionary representation of the reference annotation. |
|
||||
| **RETURNS** | `Dict[str, Any]` | Dictionary representation of the reference annotation. |
|
||||
|
||||
## Example.split_sents {#split_sents tag="method"}
|
||||
|
||||
|
|
|
@ -1,102 +1,263 @@
|
|||
---
|
||||
title: Lemmatizer
|
||||
teaser: Assign the base forms of words
|
||||
tag: class
|
||||
source: spacy/lemmatizer.py
|
||||
source: spacy/pipeline/lemmatizer.py
|
||||
new: 3
|
||||
teaser: 'Pipeline component for lemmatization'
|
||||
api_base_class: /api/pipe
|
||||
api_string_name: lemmatizer
|
||||
api_trainable: false
|
||||
---
|
||||
|
||||
<!-- TODO: rewrite once it's converted to pipe -->
|
||||
## Config and implementation
|
||||
|
||||
The `Lemmatizer` supports simple part-of-speech-sensitive suffix rules and
|
||||
lookup tables.
|
||||
The default config is defined by the pipeline component factory and describes
|
||||
how the component should be configured. You can override its settings via the
|
||||
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||
[`config.cfg` for training](/usage/training#config).
|
||||
|
||||
For examples of the lookups data formats used by the lookup and rule-based
|
||||
lemmatizers, see the
|
||||
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> config = {"mode": "rule"}
|
||||
> nlp.add_pipe("lemmatizer", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ----------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- |
|
||||
| `mode` | str | The lemmatizer mode, e.g. "lookup" or "rule". | `"lookup"` |
|
||||
| `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from `spacy-lookups-data`. | `None` |
|
||||
| `overwrite` | bool | Whether to overwrite existing lemmas. | `False` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/lemmatizer.py
|
||||
```
|
||||
|
||||
## Lemmatizer.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Initialize a `Lemmatizer`. Typically, this happens under the hood within spaCy
|
||||
when a `Language` subclass and its `Vocab` is initialized.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.lemmatizer import Lemmatizer
|
||||
> from spacy.lookups import Lookups
|
||||
> lookups = Lookups()
|
||||
> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
|
||||
> lemmatizer = Lemmatizer(lookups)
|
||||
> ```
|
||||
> # Construction via add_pipe with default model
|
||||
> lemmatizer = nlp.add_pipe("lemmatizer")
|
||||
>
|
||||
> For examples of the data format, see the
|
||||
> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.
|
||||
> # Construction via add_pipe with custom settings
> config = {"mode": "rule", "overwrite": True}
> lemmatizer = nlp.add_pipe("lemmatizer", config=config)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. |
|
||||
Create a new pipeline instance. In your application, you would normally use a
|
||||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | [`Vocab`](/api/vocab) | The vocab. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| _keyword-only_ | | |
|
||||
| `mode`         | str                                        | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`.                                                                |
| `lookups`      | [`Lookups`](/api/lookups)                  | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. |
| `overwrite`    | bool                                       | Whether to overwrite existing lemmas.                                                                                                    |
|
||||
|
||||
## Lemmatizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Lemmatize a string.
|
||||
Apply the pipe to one document. The document is modified in place, and returned.
|
||||
This usually happens under the hood when the `nlp` object is called on a text
|
||||
and all pipeline components are applied to the `Doc` in order.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.lemmatizer import Lemmatizer
|
||||
> from spacy.lookups import Lookups
|
||||
> lookups = Lookups()
|
||||
> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
|
||||
> lemmatizer = Lemmatizer(lookups)
|
||||
> lemmas = lemmatizer("ducks", "NOUN")
|
||||
> assert lemmas == ["duck"]
|
||||
> doc = nlp("This is a sentence.")
|
||||
> lemmatizer = nlp.add_pipe("lemmatizer")
|
||||
> # This usually happens under the hood
|
||||
> processed = lemmatizer(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------- | -------------------------------------------------------------------------------------------------------- |
|
||||
| `string` | str | The string to lemmatize, e.g. the token text. |
|
||||
| `univ_pos` | str / int | The token's universal part-of-speech tag. |
|
||||
| `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. |
|
||||
| **RETURNS** | list | The available lemmas for the string. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| `doc` | `Doc` | The document to process. |
|
||||
| **RETURNS** | `Doc` | The processed document. |
|
||||
|
||||
## Lemmatizer.lookup {#lookup tag="method" new="2"}
|
||||
## Lemmatizer.pipe {#pipe tag="method"}
|
||||
|
||||
Look up a lemma in the lookup table, if available. If no lemma is found, the
|
||||
Apply the pipe to a stream of documents. This usually happens under the hood
|
||||
when the `nlp` object is called on a text and all pipeline components are
|
||||
applied to the `Doc` in order.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> lemmatizer = nlp.add_pipe("lemmatizer")
|
||||
> for doc in lemmatizer.pipe(docs, batch_size=50):
|
||||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------ |
|
||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||
| _keyword-only_ | | |
|
||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||
|
||||
## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
|
||||
|
||||
Lemmatize a token using a lookup-based approach. If no lemma is found, the
|
||||
original string is returned. Languages can provide a
|
||||
[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> lookups = Lookups()
|
||||
> lookups.add_table("lemma_lookup", {"going": "go"})
|
||||
> assert lemmatizer.lookup("going") == "go"
|
||||
> ```
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------- | ------------------------------------- |
|
||||
| `token` | [`Token`](/api/token) | The token to lemmatize. |
|
||||
| **RETURNS** | `List[str]` | A list containing one or more lemmas. |
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `string` | str | The string to look up. |
|
||||
| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
|
||||
| **RETURNS** | str | The lemma if the string was found, otherwise the original string. |
|
||||
## Lemmatizer.rule_lemmatize {#rule_lemmatize tag="method"}
|
||||
|
||||
Lemmatize a token using a rule-based approach. Typically relies on POS tags.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------- | ------------------------------------- |
|
||||
| `token` | [`Token`](/api/token) | The token to lemmatize. |
|
||||
| **RETURNS** | `List[str]` | A list containing one or more lemmas. |
|
||||
|
||||
## Lemmatizer.is_base_form {#is_base_form tag="method"}
|
||||
|
||||
Check whether we're dealing with an uninflected paradigm, so we can avoid
|
||||
lemmatization entirely.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------- | ------------------------------------------------------------------------------------------------------- |
|
||||
| `token` | [`Token`](/api/token) | The token to analyze. |
|
||||
| **RETURNS** | bool | Whether the token's attributes (e.g., part-of-speech tag, morphological features) describe a base form. |
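
A minimal sketch of how these checks fit together (assuming a `lemmatizer`
created via `nlp.add_pipe("lemmatizer")` as in the examples above, and the
required lookup tables installed, e.g. from `spacy-lookups-data`):

```python
doc = nlp("I was reading the paper")
token = doc[2]
# Skip tokens whose attributes already describe a base form.
if not lemmatizer.is_base_form(token):
    lemmas = lemmatizer.rule_lemmatize(token)
```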
|
||||
|
||||
## Lemmatizer.get_lookups_config {#get_lookups_config tag="classmethod"}
|
||||
|
||||
Returns the lookups configuration settings for a given mode for use in
|
||||
[`Lemmatizer.load_lookups`](#load_lookups).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------------- |
|
||||
| `mode` | str | The lemmatizer mode. |
|
||||
| **RETURNS** | dict | The lookups configuration settings for this mode. |
|
||||
|
||||
## Lemmatizer.load_lookups {#load_lookups tag="classmethod"}
|
||||
|
||||
Load and validate lookups tables. If the provided lookups is `None`, load the
|
||||
default lookups tables according to the language and mode settings. Confirm that
|
||||
all required tables for the language and mode are present.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------------- | ---------------------------------------------------------------------------- |
|
||||
| `lang` | str | The language. |
|
||||
| `mode` | str | The lemmatizer mode. |
|
||||
| `lookups` | [`Lookups`](/api/lookups) | The provided lookups, may be `None` if the default lookups should be loaded. |
|
||||
| **RETURNS** | [`Lookups`](/api/lookups) | The lookups object. |
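
For instance (a hedged sketch, assuming the `Lemmatizer` class is importable
from `spacy.pipeline` in this version and that the tables for the language are
installed via `spacy-lookups-data`):

```python
from spacy.pipeline import Lemmatizer

# Load and validate the default lookup tables for English in "lookup" mode.
lookups = Lemmatizer.load_lookups("en", "lookup", None)
assert "lemma_lookup" in lookups
```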
|
||||
|
||||
## Lemmatizer.to_disk {#to_disk tag="method"}
|
||||
|
||||
Serialize the pipe to disk.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> pos = "verb"
|
||||
> morph = {"VerbForm": "inf"}
|
||||
> is_base_form = lemmatizer.is_base_form(pos, morph)
|
||||
> assert is_base_form == True
|
||||
> lemmatizer = nlp.add_pipe("lemmatizer")
|
||||
> lemmatizer.to_disk("/path/to/lemmatizer")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | --------- | --------------------------------------------------------------------------------------- |
|
||||
| `univ_pos` | str / int | The token's universal part-of-speech tag. |
|
||||
| `morphology` | dict | The token's morphological features. |
|
||||
| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. |
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## Lemmatizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
Load the pipe from disk. Modifies the object in place and returns it.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> lemmatizer = nlp.add_pipe("lemmatizer")
|
||||
> lemmatizer.from_disk("/path/to/lemmatizer")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Lemmatizer` | The modified `Lemmatizer` object. |
|
||||
|
||||
## Lemmatizer.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> lemmatizer = nlp.add_pipe("lemmatizer")
|
||||
> lemmatizer_bytes = lemmatizer.to_bytes()
|
||||
> ```
|
||||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Lemmatizer` object. |
|
||||
|
||||
## Lemmatizer.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> lemmatizer_bytes = lemmatizer.to_bytes()
|
||||
> lemmatizer = nlp.add_pipe("lemmatizer")
|
||||
> lemmatizer.from_bytes(lemmatizer_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Lemmatizer` | The `Lemmatizer` object. |
|
||||
|
||||
## Lemmatizer.mode {#mode tag="property"}
|
||||
|
||||
The lemmatizer mode.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------- |
|
||||
| **RETURNS** | `str` | The lemmatizer mode. |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------------------------------- | ------------------------- | --------------------------------------------------------------- |
|
||||
| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the rules and data, if available. |
|
||||
| Name | Type | Description |
|
||||
| --------- | --------------------------------- | ------------------- |
|
||||
| `vocab`   | [`Vocab`](/api/vocab)             | The shared vocabulary. |
|
||||
| `lookups` | [`Lookups`](/api/lookups) | The lookups object. |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
During serialization, spaCy will export several data fields used to restore
|
||||
different aspects of the object. If needed, you can exclude them from
|
||||
serialization by passing in the string names via the `exclude` argument.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> data = lemmatizer.to_disk("/path", exclude=["vocab"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| --------- | ---------------------------------------------------- |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `lookups` | The lookups. You usually don't want to exclude this. |
|
||||
|
|
|
@ -11,22 +11,19 @@ this class.
|
|||
|
||||
## Morphology.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Create a Morphology object using the tag map, lemmatizer and exceptions.
|
||||
Create a Morphology object.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.morphology import Morphology
|
||||
>
|
||||
> morphology = Morphology(strings, tag_map, lemmatizer)
|
||||
> morphology = Morphology(strings)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ----------------- | ---------------------------------------------------------------------------------------------------------- |
|
||||
| `strings` | `StringStore` | The string store. |
|
||||
| `tag_map` | `Dict[str, Dict]` | The tag map. |
|
||||
| `lemmatizer` | `Lemmatizer` | The lemmatizer. |
|
||||
| `exc`        | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1", "Feat2": "Val2", ...}}}`. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------- | ----------------- |
|
||||
| `strings` | `StringStore` | The string store. |
|
||||
|
||||
## Morphology.add {#add tag="method"}
|
||||
|
||||
|
@ -62,52 +59,6 @@ Get the FEATS string for the hash of the morphological analysis.
|
|||
| ------- | ---- | --------------------------------------- |
|
||||
| `morph` | int | The hash of the morphological analysis. |
|
||||
|
||||
## Morphology.load_tag_map {#load_tag_map tag="method"}
|
||||
|
||||
Replace the current tag map with the provided tag map.
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ----------------- | ------------ |
|
||||
| `tag_map` | `Dict[str, Dict]` | The tag map. |
|
||||
|
||||
## Morphology.load_morph_exceptions {#load_morph_exceptions tag="method"}
|
||||
|
||||
Replace the current morphological exceptions with the provided exceptions.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ----------------- | ----------------------------- |
|
||||
| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. |
|
||||
|
||||
## Morphology.add_special_case {#add_special_case tag="method"}
|
||||
|
||||
Add a special-case rule to the morphological analyzer. Tokens whose tag and orth
|
||||
match the rule will receive the specified properties.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> attrs = {"POS": "DET", "Definite": "Def"}
|
||||
> morphology.add_special_case("DT", "the", attrs)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ---- | ---------------------------------------------- |
|
||||
| `tag_str` | str | The fine-grained tag. |
|
||||
| `orth_str` | str | The token text. |
|
||||
| `attrs` | dict | The features to assign for this token and tag. |
|
||||
|
||||
## Morphology.exc {#exc tag="property"}
|
||||
|
||||
The current morphological exceptions.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ---- | --------------------------------------------------- |
|
||||
| **YIELDS** | dict | The current dictionary of morphological exceptions. |
|
||||
|
||||
## Morphology.lemmatize {#lemmatize tag="method"}
|
||||
|
||||
TODO
|
||||
|
||||
## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"}
|
||||
|
||||
Convert a string FEATS representation to a dictionary of features and values in
|
||||
|
|
|
@ -47,7 +47,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx
|
|||
>
|
||||
> # Construction via create_pipe with custom model
|
||||
> config = {"model": {"@architectures": "my_tagger"}}
|
||||
> parser = nlp.add_pipe("tagger", config=config)
|
||||
> tagger = nlp.add_pipe("tagger", config=config)
|
||||
>
|
||||
> # Construction from class
|
||||
> from spacy.pipeline import Tagger
|
||||
|
@ -285,16 +285,14 @@ Add a new label to the pipe.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.symbols import POS
|
||||
> tagger = nlp.add_pipe("tagger")
|
||||
> tagger.add_label("MY_LABEL", {POS: "NOUN"})
|
||||
> tagger.add_label("MY_LABEL")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | --------------------------------------------------------------- |
|
||||
| `label` | str | The label to add. |
|
||||
| `values` | `Dict[int, str]` | Optional values to map to the label, e.g. a tag map dictionary. |
|
||||
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------------------------- |
|
||||
| `label` | str | The label to add. |
|
||||
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
|
||||
|
||||
## Tagger.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -369,9 +367,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
|
||||
## Tagger.labels {#labels tag="property"}
|
||||
|
||||
The labels currently added to the component. Note that even for a blank
|
||||
component, this will always include the built-in coarse-grained part-of-speech
|
||||
tags by default, e.g. `VERB`, `NOUN` and so on.
|
||||
The labels currently added to the component.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -396,9 +392,8 @@ serialization by passing in the string names via the `exclude` argument.
|
|||
> data = tagger.to_disk("/path", exclude=["vocab"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| --------- | ------------------------------------------------------------------------------------------ |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `cfg` | The config file. You usually don't want to exclude this. |
|
||||
| `model` | The binary model data. You usually don't want to exclude this. |
|
||||
| `tag_map` | The [tag map](/usage/adding-languages#tag-map) mapping fine-grained to coarse-grained tag. |
|
||||
| Name | Description |
|
||||
| ------- | -------------------------------------------------------------- |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `cfg` | The config file. You usually don't want to exclude this. |
|
||||
| `model` | The binary model data. You usually don't want to exclude this. |
|
||||
|
|
|
@ -5,9 +5,20 @@ tag: class
|
|||
source: spacy/tokenizer.pyx
|
||||
---
|
||||
|
||||
> #### Default config
|
||||
>
|
||||
> ```ini
|
||||
> [nlp.tokenizer]
|
||||
> @tokenizers = "spacy.Tokenizer.v1"
|
||||
> ```
|
||||
|
||||
Segment text, and create `Doc` objects with the discovered segment boundaries.
|
||||
For a deeper understanding, see the docs on
|
||||
[how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works).
|
||||
The tokenizer is typically created automatically when a
[`Language`](/api/language) subclass is initialized, and it reads its settings,
like punctuation and special-case rules, from the
[`Language.Defaults`](/api/language#defaults) provided by the language subclass.
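
The tokenizer can also be swapped out via the `tokenizers` registry. The
following is a hedged sketch (the registered name `whitespace_tokenizer.v1` is
illustrative): the registered function returns a callback that receives the
`nlp` object and returns a tokenizer.

```python
import spacy
from spacy.tokenizer import Tokenizer

@spacy.registry.tokenizers("whitespace_tokenizer.v1")
def create_whitespace_tokenizer():
    def create_tokenizer(nlp):
        # A bare Tokenizer with only the shared vocab: no prefix/suffix/infix
        # rules and no special cases, so text is split on whitespace only.
        return Tokenizer(nlp.vocab)
    return create_tokenizer
```

The registered name can then be referenced from the `[nlp.tokenizer]` block of
the config, in place of the default `"spacy.Tokenizer.v1"` shown above.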
|
||||
|
||||
## Tokenizer.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@ menu:
|
|||
- ['spacy', 'spacy']
|
||||
- ['displacy', 'displacy']
|
||||
- ['registry', 'registry']
|
||||
- ['Readers & Batchers', 'readers-batchers']
|
||||
- ['Batchers', 'batchers']
|
||||
- ['Data & Alignment', 'gold']
|
||||
- ['Utility Functions', 'util']
|
||||
---
|
||||
|
@ -299,13 +299,14 @@ factories.
|
|||
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
|
||||
| `factories` | Registry for functions that create [pipeline components](/usage/processing-pipelines#custom-components). Added automatically when you use the `@spacy.component` decorator and also reads from [entry points](/usage/saving-loading#entry-points) |
|
||||
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
|
||||
| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
||||
| `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
|
||||
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
||||
| `assets` | |
|
||||
| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
|
||||
| `readers` | Registry for training and evaluation [data readers](#readers-batchers). |
|
||||
| `batchers` | Registry for training and evaluation [data batchers](#readers-batchers). |
|
||||
| `readers` | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). |
|
||||
| `batchers` | Registry for training and evaluation [data batchers](#batchers). |
|
||||
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
|
||||
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
|
||||
| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
|
||||
|
@ -337,42 +338,9 @@ See the [`Transformer`](/api/transformer) API reference and
|
|||
| [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. |
|
||||
| [`annotation_setters`](/api/transformer#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. |
|
||||
|
||||
## Data readers and batchers {#readers-batchers new="3"}
|
||||
## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}
|
||||
|
||||
<!-- TODO: -->
|
||||
|
||||
### spacy.Corpus.v1 {#corpus tag="registered function" source="spacy/gold/corpus.py"}
|
||||
|
||||
Registered function that creates a [`Corpus`](/api/corpus) of training or
|
||||
evaluation data. It takes the same arguments as the `Corpus` class and returns a
|
||||
callable that yields [`Example`](/api/example) objects. You can replace it with
|
||||
your own registered function in the [`@readers` registry](#registry) to
|
||||
customize the data loading and streaming.
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [paths]
|
||||
> train = "corpus/train.spacy"
|
||||
>
|
||||
> [training.train_corpus]
|
||||
> @readers = "spacy.Corpus.v1"
|
||||
> path = ${paths:train}
|
||||
> gold_preproc = false
|
||||
> max_length = 0
|
||||
> limit = 0
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ------ | ----------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | `Path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). |
|
||||
| `gold_preproc` | bool | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. |
|
||||
| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. |
|
||||
| `limit` | int | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. |
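
As a usage sketch (assuming `Corpus` is importable from `spacy.gold` in this
version, and that the resulting corpus object is called with the `nlp` object
to yield the examples, as described above; the path mirrors the config example):

```python
from spacy.gold import Corpus

corpus = Corpus("corpus/train.spacy", gold_preproc=False, max_length=0, limit=0)
# Calling the corpus with an nlp object yields Example objects.
train_examples = list(corpus(nlp))
```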
|
||||
|
||||
### Batchers {#batchers source="spacy/gold/batchers.py"}
|
||||
|
||||
<!-- TODO: -->
|
||||
<!-- TODO: intro and also describe signature of functions -->
|
||||
|
||||
#### batch_by_words.v1 {#batch_by_words tag="registered function"}
|
||||
|
||||
|
@ -446,28 +414,6 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
|
|||
|
||||
## Training data and alignment {#gold source="spacy/gold"}
|
||||
|
||||
### gold.docs_to_json {#docs_to_json tag="function"}
|
||||
|
||||
Convert a list of Doc objects into the
|
||||
[JSON-serializable format](/api/data-formats#json-input) used by the
|
||||
[`spacy train`](/api/cli#train) command. Each input doc will be treated as a
|
||||
'paragraph' in the output doc.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.gold import docs_to_json
|
||||
>
|
||||
> doc = nlp("I like London")
|
||||
> json_data = docs_to_json([doc])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | ------------------------------------------ |
|
||||
| `docs` | iterable / `Doc` | The `Doc` object(s) to convert. |
|
||||
| `id` | int | ID to assign to the JSON. Defaults to `0`. |
|
||||
| **RETURNS** | dict | The data in spaCy's JSON format. |
|
||||
|
||||
### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"}
|
||||
|
||||
Encode labelled spans into per-token tags, using the
|
||||
|
|
|
@ -24,8 +24,6 @@ Create the vocabulary.
|
|||
| Name | Type | Description |
|
||||
| -------------------------------------------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lex_attr_getters` | dict | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. |
|
||||
| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
|
||||
| `lemmatizer` | object | A lemmatizer. Defaults to `None`. |
|
||||
| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. |
|
||||
| `lookups` | `Lookups` | A [`Lookups`](/api/lookups) that stores the `lemma_\*`, `lexeme_norm` and other large lookup tables. Defaults to `None`. |
|
||||
| `lookups_extra` <Tag variant="new">2.3</Tag> | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. |
|
||||
|
|
|
@ -1,85 +0,0 @@
|
|||
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
|
||||
<style>
|
||||
.svg__langdata__text-large, .svg__langdata__text-small, .svg__langdata__text-tiny {
|
||||
font-family: Arial, sans-serif;
|
||||
fill: #1a1e23
|
||||
}
|
||||
.svg__langdata__text-large { font-size: 20px }
|
||||
.svg__langdata__text-small, .svg__langdata__text-tiny { font-weight: bold; font-size: 15px; }
|
||||
</style>
|
||||
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
|
||||
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>
|
||||
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M642.7 361.3H708l33 43.6-33 43.5H643L610 405z"/>
|
||||
<text class="svg__langdata__text-large" transform="translate(630 410)" width="80" height="22">Tokenizer</text>
|
||||
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H621v-56.8"/>
|
||||
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M621 240.2l4 8-4-2-4 2z"/>
|
||||
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M855 253v-20.8"/>
|
||||
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M855 226.2l4 8-4-2-4 2z"/>
|
||||
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303h-45v-56.8"/>
|
||||
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M735 240.2l4 8-4-2-4 2z"/>
|
||||
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H504v-56.8"/>
|
||||
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M504 240.2l4 8-4-2-4 2z"/>
|
||||
<ellipse cx="855" cy="303" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
|
||||
<text class="svg__langdata__text-large" transform="translate(815 308)" width="119" height="46">Base data</text>
|
||||
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100l.4 39.8"/>
|
||||
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389.5 145.8l-4-8 4 2 4-2z"/>
|
||||
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17h232v22.8"/>
|
||||
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M621 145.8l-4-8 4 2 4-2z"/>
|
||||
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17H280v22.8"/>
|
||||
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M280 145.8l-4-8 4 2 4-2z"/>
|
||||
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17h115v22.8"/>
|
||||
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M504 145.8l-4-8 4 2 4-2z"/>
|
||||
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17h346v22.8"/>
|
||||
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M735 145.8l-4-8 4 2 4-2z"/>
|
||||
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17H163v22.8"/>
|
||||
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M163 145.8l-4-8 4 2 4-2z"/>
|
||||
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17H46v22.8"/>
|
||||
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M46 145.8l-4-8 4 2 4-2z"/>
|
||||
<ellipse cx="389" cy="50" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
|
||||
<text class="svg__langdata__text-large" transform="translate(346.5 42)" width="81" height="46">Language <tspan dy="1.45em" dx="-3.7em">data</tspan></text>
|
||||
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M435 193h15.8"/>
|
||||
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M456.8 193l-8 4 2-4-2-4z"/>
|
||||
<ellipse cx="390" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
|
||||
<text class="svg__langdata__text-small" transform="translate(368 187.5)" width="39" height="30">stop <tspan dx="-2.8em" dy="1.25em">words</tspan></text>
|
||||
<path fill="none" stroke="#9673a6" stroke-width="3" stroke-miterlimit="10" d="M472 225l-1.5 133.8" stroke-dasharray="1 6" stroke-linecap="round"/>
|
||||
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M470.4 364.8l-4-8 4 2 4-2z"/>
|
||||
<ellipse cx="504" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
|
||||
<text class="svg__langdata__text-small" transform="translate(473 187.5)" width="85" height="30">lexical <tspan dx="-4em" dy="1.25em">attributes</tspan></text>
|
||||
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M653 225l5.6 127.8" stroke-dasharray="1 6" stroke-linecap="round"/>
|
||||
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M659 358.8l-4.5-8 4 2 4-2.2z"/>
|
||||
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M576 193h-18.8"/>
|
||||
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M551.2 193l8-4-2 4 2 4z"/>
|
||||
<ellipse cx="621" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
|
||||
<text class="svg__langdata__text-small" transform="translate(582 187.5)" width="85" height="30">tokenizer <tspan dx="-5.2em" dy="1.25em">exceptions</tspan></text>
|
||||
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M690 193h-15.8"/>
|
||||
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M668.2 193l8-4-2 4 2 4z"/>
|
||||
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M703 225l-10.3 127.8" stroke-dasharray="1 6" stroke-linecap="round"/>
|
||||
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M692.2 358.8l-3.4-8.3 4 2.3 4-1.7z"/>
|
||||
<ellipse cx="735" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
|
||||
<text class="svg__langdata__text-small" transform="translate(705 182)" width="53" height="46">prefixes, <tspan dy="1.25em" dx="-4.4em">suffixes,</tspan> <tspan dy="1.25em" dx="-4em">infixes</tspan>
|
||||
</text>
|
||||
<path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M280 238v114.8" stroke-dasharray="1 6" stroke-linecap="round"/>
|
||||
<path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M280 358.8l-4-8 4 2 4-2z"/>
|
||||
<ellipse cx="280" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
|
||||
<text class="svg__langdata__text-small" transform="translate(254 187.5)" width="71" height="30">lemma <tspan dy="1.25em" dx="-3em">data</tspan></text>
|
||||
<path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M346 404h53.8" stroke-dasharray="1 6" stroke-linecap="round"/>
|
||||
<path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M405.8 404l-8 4 2-4-2-4z"/>
|
||||
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M247.7 361.3H313l33 43.6-33 43.5h-65.3L215 405z"/>
|
||||
<text class="svg__langdata__text-large" transform="translate(228 410)" width="100" height="22">Lemmatizer</text>
|
||||
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M823 193h-34.8"/>
|
||||
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M782.2 193l8-4-2 4 2 4z"/>
|
||||
<ellipse cx="855" cy="193" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="31.5" ry="31.5"/>
|
||||
<text class="svg__langdata__text-tiny" transform="translate(829 189)" width="50" height="30">char <tspan dy="1.1em" dx="-3.1em">classes</tspan></text>
|
||||
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M408 367h124v74H408z"/>
|
||||
<text class="svg__langdata__text-large" transform="translate(443.5 410)" width="51" height="22">Token</text>
|
||||
<path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M131 225l-21 122.2" stroke-dasharray="1 6" stroke-linecap="round"/>
|
||||
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M109 353l-2.5-8.5 3.6 2.7 4.4-1.3z"/>
|
||||
<ellipse cx="163" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
|
||||
<text class="svg__langdata__text-small" transform="translate(139 187.5)" width="45" height="30">morph <tspan dy="1.25em" dx="-3.2em">rules</tspan></text>
|
||||
<path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M78 225l15.4 122" stroke-dasharray="1 6" stroke-linecap="round"/>
|
||||
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M94.2 353l-5-7.5 4.2 1.5 3.7-2.5z"/>
|
||||
<ellipse cx="46" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
|
||||
<text class="svg__langdata__text-small" transform="translate(33 187.5)" width="27" height="30">tag <tspan dy="1.25em" dx="-2em">map</tspan></text>
|
||||
<ellipse cx="101" cy="405" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.5" ry="49.5"/>
|
||||
<text class="svg__langdata__text-large" transform="translate(49.5 410)" width="100" height="22">Morphology</text>
|
||||
</svg>
|
Before Width: | Height: | Size: 9.1 KiB |
|
@ -1,123 +1,305 @@
|
|||
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="600" height="380" viewBox="-20 -10 550 400">
|
||||
<style>
|
||||
.svg__tokenization__text { fill: #1a1e23; font: 18px Arial, sans-serif }
|
||||
.svg__tokenization__text-small { fill: #fff; font: 600 13px Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace }
|
||||
</style>
|
||||
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 39v12H16v11M71 39v12h20v11"/>
|
||||
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M1 1h140v38.2H1z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" width="43" height="19" transform="translate(48.5 9.5)">“Let’s</text>
|
||||
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 39v23"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 1h50v38.2h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 9.5)" width="19" height="19">go</text>
|
||||
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 39v23"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 1h50v38.2h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 9.5)" width="15" height="19">to</text>
|
||||
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 39v23"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 1h141v38.2H270z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 9.5)" width="38" height="19">N.Y.!”</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 100v20"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 62h30v38.2H1z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 70.5)" width="7" height="19">“</text>
|
||||
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M91 100v11H66v9M91 100v11h29v9"/>
|
||||
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M41 62h100v38.2H41z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(72.5 70.5)" width="35" height="19">Let’s</text>
|
||||
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 100v20"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 62h50v38.2h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 70.5)" width="19" height="19">go</text>
|
||||
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 100v20"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 62h50v38.2h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 70.5)" width="15" height="19">to</text>
|
||||
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 100v20"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 62h141v38.2H270z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 70.5)" width="38" height="19">N.Y.!”</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 120h30v38H1z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 128.5)" width="7" height="19">“</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 158v24"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 120h50v38H41z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 128.5)" width="23" height="19">Let</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 158v24"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 120h50v38h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 128.5)" width="19" height="19">go</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 158v24"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 120h50v38h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 128.5)" width="15" height="19">to</text>
|
||||
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M341 158v13h-20v11M341 158v13h55v11"/>
|
||||
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 120h141v38H270z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 128.5)" width="38" height="19">N.Y.!”</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 158v24"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 120h40v38h-40z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 128.5)" width="11" height="19">’s</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 220v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 181.8h30V220H1z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 190.5)" width="7" height="19">“</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 220v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 181.8h50V220H41z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 190.5)" width="23" height="19">Let</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 220v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 181.8h50V220h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 190.5)" width="19" height="19">go</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 220v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 181.8h50V220h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 190.5)" width="15" height="19">to</text>
|
||||
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M321 220v11h-20v12M321 220v11h34v12"/>
|
||||
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 181.8h101V220H270z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(304.5 190.5)" width="30" height="19">N.Y.!</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 220v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 181.8h40V220h-40z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 190.5)" width="11" height="19">’s</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 220v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 181.8h30V220h-30z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 190.5)" width="7" height="19">”</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 281v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 242.7h30V281H1z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 251.5)" width="7" height="19">“</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 281v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 242.7h50V281H41z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 251.5)" width="23" height="19">Let</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 281v20-17 20"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 242.7h50V281h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 251.5)" width="19" height="19">go</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 281v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 242.7h50V281h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 251.5)" width="15" height="19">to</text>
|
||||
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M301 281v23"/>
|
||||
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" d="M270 242.7h61V281h-61z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 251.5)" width="26" height="19">N.Y.</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 281v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 242.7h40V281h-40z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 251.5)" width="11" height="19">’s</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 281v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 242.7h30V281h-30z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 251.5)" width="7" height="19">”</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M355 281v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 242.7h30V281h-30z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 251.5)" width="5" height="19">!</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 304h30v38H1z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 312.5)" width="7" height="19">“</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 304h50v38H41z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 312.5)" width="23" height="19">Let</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 304h50v38h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 312.5)" width="19" height="19">go</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 304h50v38h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 312.5)" width="15" height="19">to</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M270 304h61v38h-61z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 312.5)" width="26" height="19">N.Y.</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 304h40v38h-40z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 312.5)" width="11" height="19">’s</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 304h30v38h-30z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 312.5)" width="7" height="19">”</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 304h30v38h-30z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 312.5)" width="5" height="19">!</text>
|
||||
<rect width="104" height="19" x="437" y="72" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 74.5)" width="65" height="12">EXCEPTION</text>
|
||||
<rect width="104" height="19" x="437" y="11" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 13.5)" width="43" height="12">PREFIX</text>
|
||||
<rect width="104" height="19" x="437" y="130" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 132.5)" width="43" height="12">SUFFIX</text>
|
||||
<rect width="104" height="19" x="437" y="191" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 193.5)" width="43" height="12">SUFFIX</text>
|
||||
<rect width="104" height="19" x="437" y="252" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 254.5)" width="65" height="12">EXCEPTION</text>
|
||||
<rect width="104" height="19" x="437" y="313" fill="#82b366" stroke="#82b366" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(473.5 315.5)" width="29" height="12">DONE</text>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="598" height="386" viewBox="0 0 598 386">
|
||||
<defs>
|
||||
<path id="a" d="M51.3 10.9a4.3 4.3 0 01-.6-2.2c0-.6.2-1.2.5-1.9a6 6 0 011.4-1.6l.6.4.1.2-.1.3a7.5 7.5 0 00-.6 1l-.2.5a2.5 2.5 0 000 1.4l.3.8.1.2c0 .2 0 .3-.3.4l-1.2.5zm3.4 0a4.3 4.3 0 01-.7-2.2c0-.6.2-1.2.5-1.9A6 6 0 0156 5.2l.6.4h.1v.5a7.5 7.5 0 00-.7 1l-.2.5a2.5 2.5 0 000 1.4l.4.8v.2c0 .2 0 .3-.2.4l-1.2.5zm7.4 9.3H69V22h-9V6.2h2.2v14zM75 10.7a5 5 0 011.9.3 4.1 4.1 0 012.4 2.5c.2.7.3 1.4.3 2.2v.6l-.4.1h-7.5c0 .7.1 1.4.3 1.9.2.5.4 1 .7 1.3l1.1.8 1.5.2c.5 0 .9 0 1.2-.2a6 6 0 001.6-.7l.5-.2.3.2.6.7-.9.8-1 .5a6.9 6.9 0 01-4.6 0c-.7-.2-1.2-.6-1.7-1-.5-.6-.8-1.2-1.1-2a7.6 7.6 0 010-4.7 4.7 4.7 0 012.7-3c.6-.2 1.3-.3 2.1-.3zm0 1.4a3 3 0 00-2.2.8c-.5.6-.9 1.3-1 2.3h6l-.1-1.2c-.1-.4-.3-.7-.6-1-.2-.3-.5-.5-.9-.7a3 3 0 00-1.2-.2zm10.5 10c-.9 0-1.6-.2-2-.7-.5-.5-.7-1.2-.7-2v-6.9h-1.4l-.3-.1-.1-.3v-.8l1.8-.2.5-3.5.1-.3h1.3V11H88v1.4h-3.2v6.7c0 .5.1.8.4 1 .2.3.5.4.8.4l.6-.1a2.3 2.3 0 00.6-.4h.2l.3.1.6 1c-.3.3-.8.5-1.2.7l-1.5.3zm6.2-16.7a4.1 4.1 0 01.6 2.1 4 4 0 01-.5 2 6 6 0 01-1.3 1.6l-.6-.4-.2-.1v-.1l.1-.3a5.1 5.1 0 00.7-1l.2-.5a2.5 2.5 0 000-1.4l-.4-.8v-.2c0-.2 0-.3.2-.4l1.2-.5zm9.7 7.3c-.1.2-.3.2-.4.2h-.4a8.6 8.6 0 00-1.2-.6 3.4 3.4 0 00-2 0l-.6.3-.4.5-.2.7c0 .2.1.5.3.7l.6.5 1 .3a49.6 49.6 0 013 1.3l.7.8.2 1.2c0 .5 0 1-.3 1.4-.2.5-.4.8-.8 1.1a4 4 0 01-1.3.8c-.5.2-1.1.3-1.8.3a5.6 5.6 0 01-3.7-1.4l.4-.7.2-.2.4-.1.4.1.5.4.8.3 1 .2c.5 0 .8 0 1-.2.4 0 .6-.2.8-.4l.4-.6.2-.7c0-.3-.1-.5-.3-.7a2 2 0 00-.6-.6l-1-.3a68.4 68.4 0 01-2.1-.8l-1-.5-.6-.9c-.2-.3-.2-.7-.2-1.2a3 3 0 011-2.2c.3-.3.7-.6 1.2-.8a5 5 0 011.7-.2c.8 0 1.4.1 2 .3l1.5 1-.4.7z"/>
|
||||
<path id="b" d="M183.5 10.7l1.4.1 1.1.5h3v.7c0 .3 0 .4-.4.5l-1.3.2c.3.4.4 1 .4 1.6a3.4 3.4 0 01-1.2 2.6c-.3.3-.8.5-1.3.7a5.6 5.6 0 01-3.2 0l-.5.5-.2.5c0 .3.1.5.4.6.2.2.5.3.8.3l1.2.1a36.1 36.1 0 012.7.3c.5 0 .9.2 1.2.4.4.2.7.4.9.7a3 3 0 010 2.6c-.3.5-.6 1-1 1.3-.5.3-1 .6-1.7.8a8.4 8.4 0 01-4.3 0 5 5 0 01-1.6-.6c-.4-.2-.7-.6-.9-1-.2-.3-.3-.6-.3-1 0-.6.2-1 .5-1.4.4-.4.9-.7 1.5-1a2 2 0 01-.8-.5c-.2-.3-.3-.6-.3-1l.1-.5c0-.2.2-.4.3-.5a2.9 2.9 0 011-1c-.5-.2-1-.6-1.2-1.2-.3-.5-.5-1-.5-1.7a3.3 3.3 0 011.2-2.6 4 4 0 011.3-.8l1.7-.2zm3.5 12c0-.4 0-.6-.2-.8l-.7-.4-.9-.2a13.9 13.9 0 00-2.2-.1l-1.2-.1-1 .7a1.5 1.5 0 00-.2 1.7l.6.6c.3.1.6.3 1 .3l1.4.2c.6 0 1 0 1.4-.2.5 0 .8-.2 1.1-.4.3-.1.5-.3.7-.6l.2-.8zm-3.5-6.1l1-.2c.4-.1.6-.3.8-.5l.5-.7.2-.9c0-.7-.2-1.2-.7-1.6-.4-.4-1-.6-1.8-.6s-1.4.2-1.8.6c-.4.4-.6 1-.6 1.6 0 .3 0 .6.2 1a2 2 0 001.2 1c.3.2.6.3 1 .3zm12-6c.8 0 1.6.2 2.2.5a4.7 4.7 0 012.8 3c.2.7.3 1.5.3 2.3 0 .9 0 1.7-.3 2.4s-.6 1.3-1 1.8c-.6.5-1.1.9-1.8 1.2-.6.2-1.4.4-2.2.4-.8 0-1.5-.2-2.2-.4-.6-.3-1.2-.7-1.7-1.2-.4-.5-.8-1.1-1-1.8a7 7 0 01-.4-2.4c0-.8.1-1.6.4-2.3.2-.8.6-1.4 1-1.9.5-.5 1-.8 1.7-1.1.7-.3 1.4-.4 2.2-.4zm0 10c1.1 0 2-.3 2.5-1 .5-.8.8-1.8.8-3.2 0-1.3-.3-2.3-.8-3-.5-.8-1.4-1.2-2.5-1.2-.5 0-1 .1-1.4.3-.4.2-.8.5-1 .8l-.7 1.4-.2 1.7.2 1.8.6 1.3c.3.4.7.6 1 .8l1.5.3z"/>
|
||||
<path id="c" d="M250.4 22.2c-.8 0-1.5-.3-2-.8s-.7-1.2-.7-2v-6.9h-1.3l-.3-.1-.2-.3v-.8l1.9-.2.4-3.5c0-.1 0-.2.2-.3h1.3V11h3.2v1.4h-3.2v6.7c0 .5 0 .8.3 1 .2.3.5.4.9.4l.5-.1a2.3 2.3 0 00.7-.4h.2l.3.1.5 1a4.1 4.1 0 01-2.7 1zm9.4-11.5c.8 0 1.5.1 2.2.4a4.7 4.7 0 012.7 3 7.2 7.2 0 010 4.7c-.2.7-.6 1.3-1 1.8-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-.8 0-1.6-.2-2.2-.4-.7-.3-1.2-.7-1.7-1.2s-.8-1.1-1-1.8a7 7 0 01-.4-2.4c0-.8 0-1.6.3-2.3a4.7 4.7 0 012.7-3c.7-.3 1.5-.4 2.3-.4zm0 10c1 0 2-.4 2.4-1.2.6-.7.9-1.7.9-3 0-1.4-.3-2.4-.9-3.2-.5-.7-1.3-1-2.4-1-.6 0-1 0-1.5.2l-1 .8c-.3.4-.5.8-.6 1.4l-.2 1.7c0 .7 0 1.3.2 1.8.1.5.3 1 .6 1.3.3.4.6.6 1 .8.4.2 1 .3 1.5.3z"/>
|
||||
<path id="d" d="M347.6 6.2l.5.1.3.3 9.1 11.9a7.5 7.5 0 010-1.1V6.2h1.8V22h-1a1 1 0 01-.5 0 1 1 0 01-.3-.4l-9.1-11.9a14.1 14.1 0 010 1V22h-1.9V6.2h1.1zm14.6 14.6a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1c.2 0 .4.2.5.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.4-1zm10-5V22h-2v-6.3l-5.9-9.5h2c.1 0 .3 0 .4.2l.3.3 3.6 6.2a7.6 7.6 0 01.6 1.4 13 13 0 01.6-1.4l3.6-6.2.3-.3.4-.2h2l-5.9 9.5zm5.2 5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1c.2 0 .3.2.4.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1zm8.4-14.6v6.3a27.8 27.8 0 01-.2 4h-1.4a66.4 66.4 0 01-.2-4V6.2h1.8zm-2.3 14.6a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1c.2 0 .3.2.4.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1zm8.1-15.4a4.1 4.1 0 01.6 2.1 4 4 0 01-.5 2 6 6 0 01-1.3 1.6l-.6-.4-.2-.1v-.1l.1-.3a5.1 5.1 0 00.7-1l.2-.5a2.5 2.5 0 000-1.4l-.4-.8v-.2c0-.2 0-.3.2-.4l1.2-.5zm3.4 0a4.1 4.1 0 01.6 2.1 4 4 0 01-.5 2 6 6 0 01-1.4 1.6l-.6-.4-.1-.1v-.1-.3a5.1 5.1 0 00.7-1l.2-.5a2.5 2.5 0 000-1.4l-.4-.8v-.2c0-.2 0-.3.3-.4l1.2-.5z"/>
|
||||
<path id="e" d="M13.5 77.8c-.5-.8-.8-1.6-.8-2.5 0-.7.2-1.4.6-2 .3-.7.9-1.4 1.6-2l.8.6.2.2V72.4l-.1.2a5.7 5.7 0 00-.6.8l-.2.6-.1.7.1.8c0 .3.2.5.4.9l.1.3c0 .2-.1.4-.4.5l-1.6.6zm3.6 0c-.5-.8-.7-1.6-.7-2.5 0-.7.2-1.4.5-2 .4-.7 1-1.4 1.6-2l.9.6.1.2v.5a5.7 5.7 0 00-.7.8l-.2.6v1.5l.5.9v.3c0 .2 0 .4-.3.5l-1.7.6z"/>
|
||||
<path id="f" d="M80.8 86.8h6.8v1.7h-9V72.8h2.2v14zm12.9-9.6a5 5 0 011.8.4A4.1 4.1 0 0198 80c.2.6.3 1.3.3 2.1l-.1.6-.4.2h-7.4c0 .7.1 1.3.3 1.8.2.5.4 1 .7 1.3.3.4.7.6 1.1.8l1.5.3 1.2-.2a6 6 0 001.6-.7l.4-.2c.2 0 .3 0 .4.2l.6.7-1 .7c-.2.3-.6.4-1 .6a6.9 6.9 0 01-4.5 0c-.7-.3-1.2-.6-1.7-1.1-.5-.5-.9-1.2-1.1-1.9a7.6 7.6 0 010-4.7c.2-.7.5-1.3 1-1.8.4-.5 1-.9 1.6-1.2.6-.2 1.4-.4 2.2-.4zm0 1.5a3 3 0 00-2.2.8c-.5.5-.9 1.3-1 2.3h6l-.1-1.3-.6-1c-.2-.2-.5-.5-.9-.6a3 3 0 00-1.2-.2zm10.5 10c-.9 0-1.6-.2-2-.7-.5-.5-.8-1.2-.8-2.1V79h-1.6l-.1-.4v-.8l1.8-.2.5-3.4.1-.3.3-.1h1v3.8h3.2V79h-3.2v6.7c0 .5.1.8.3 1 .3.3.6.4 1 .4h.4a2.3 2.3 0 00.7-.4l.2-.1c.1 0 .2 0 .3.2l.6 1-1.3.7-1.4.2zm6.2-16.7a4.1 4.1 0 01.6 2 4 4 0 01-.5 2 6 6 0 01-1.4 1.6l-.6-.3V77h-.1l.1-.3a5.1 5.1 0 00.6-1l.2-.6a2.5 2.5 0 000-1.3c0-.3-.2-.6-.3-.8l-.1-.3c0-.1 0-.3.3-.4l1.2-.4zm9.6 7.2c0 .2-.2.3-.4.3l-.3-.1a8.6 8.6 0 00-1.3-.6 3.4 3.4 0 00-1.8 0l-.7.4-.5.5-.1.6c0 .3 0 .5.2.7l.7.5 1 .4a49.6 49.6 0 013 1.3c.2.2.5.5.6.8.2.3.3.7.3 1.1l-.3 1.5c-.2.4-.5.8-.8 1a4 4 0 01-1.3.8l-1.8.3a5.6 5.6 0 01-3.8-1.3l.5-.8.2-.2h.8l.5.4.7.4 1.2.1 1-.1.7-.4.4-.6.1-.7c0-.3 0-.6-.2-.8a2 2 0 00-.7-.5l-.9-.4a68.4 68.4 0 01-2.1-.7l-1-.6-.6-.8-.3-1.2a3 3 0 011-2.3l1.3-.7a5 5 0 011.7-.3c.7 0 1.4.1 2 .4.6.2 1 .5 1.5 1l-.5.6z"/>
|
||||
<path id="g" d="M182.2 77.2c.4 0 .9 0 1.3.2.4 0 .8.2 1.2.4h3v.8c0 .2-.2.4-.5.4l-1.2.2c.2.5.3 1 .3 1.6a3.4 3.4 0 01-1.1 2.6c-.4.3-.8.6-1.4.7-.5.2-1 .3-1.6.3-.6 0-1 0-1.5-.2l-.5.5c-.2.2-.2.3-.2.5s0 .4.3.6l.8.3h1.2a36.1 36.1 0 012.8.3l1.2.4.8.8c.2.3.3.7.3 1.2s0 1-.3 1.4c-.3.5-.6.9-1 1.2a7 7 0 01-3.8 1.1c-.9 0-1.6 0-2.2-.2a5 5 0 01-1.5-.6l-1-1-.2-1c0-.6.1-1.1.5-1.5.3-.4.8-.7 1.4-1a2 2 0 01-.7-.5c-.2-.2-.3-.6-.3-1v-.5l.3-.5a2.9 2.9 0 011.1-.9c-.5-.3-1-.7-1.3-1.2-.3-.5-.5-1.1-.5-1.8 0-.5.1-1 .4-1.5l.8-1.1a4 4 0 011.4-.7c.5-.2 1-.3 1.7-.3zm3.4 12c0-.3 0-.6-.2-.7-.1-.2-.4-.3-.6-.4l-1-.2a13.9 13.9 0 00-2.2-.2h-1.1c-.4.1-.8.4-1 .6a1.5 1.5 0 00-.2 1.8c.1.2.3.4.6.5.2.2.6.3 1 .4l1.4.1 1.4-.1c.4-.1.8-.2 1-.4l.7-.6c.2-.3.2-.6.2-.8zm-3.4-6.1c.4 0 .7 0 1-.2.3 0 .6-.2.8-.4l.4-.7.2-1c0-.6-.2-1.2-.6-1.6-.4-.4-1-.6-1.8-.6s-1.4.2-1.8.6a2.5 2.5 0 00-.5 2.5 2 2 0 001.2 1.2l1 .2zm12-5.9c.8 0 1.5.2 2.2.4a4.7 4.7 0 012.7 3c.2.7.4 1.5.4 2.4 0 .8-.2 1.6-.4 2.3-.2.7-.6 1.3-1 1.8-.5.5-1 1-1.7 1.2-.7.3-1.4.4-2.2.4-.8 0-1.6-.1-2.2-.4-.7-.3-1.3-.7-1.7-1.2-.5-.5-.8-1-1-1.8a7 7 0 01-.5-2.3c0-.9.2-1.7.4-2.4.3-.7.6-1.3 1-1.8.5-.5 1.1-.9 1.8-1.2.6-.2 1.4-.4 2.2-.4zm0 10c1 0 1.9-.4 2.4-1.1.6-.8.8-1.8.8-3.1s-.2-2.4-.8-3.1c-.5-.8-1.3-1.1-2.4-1.1-.6 0-1 0-1.5.3-.4.1-.7.4-1 .8-.3.3-.5.8-.6 1.3-.2.5-.2 1.1-.2 1.8 0 .6 0 1.2.2 1.8.1.5.3 1 .6 1.3l1 .8c.4.2 1 .3 1.5.3z"/>
|
||||
<path id="h" d="M249 88.7c-1 0-1.6-.2-2-.7-.6-.5-.8-1.2-.8-2.1V79h-1.6l-.2-.4v-.8l1.9-.2.4-3.4c0-.2 0-.2.2-.3l.3-.1h1v3.8h3.2V79h-3.2v6.7c0 .5 0 .8.3 1 .2.3.5.4.9.4h.5a2.3 2.3 0 00.7-.4l.2-.1c.1 0 .2 0 .3.2l.5 1-1.2.7-1.5.2zm9.3-11.5c.8 0 1.5.2 2.2.4a4.7 4.7 0 012.7 3c.3.7.4 1.5.4 2.4 0 .8-.1 1.6-.4 2.3-.2.7-.6 1.3-1 1.8-.5.5-1 1-1.7 1.2-.7.3-1.4.4-2.2.4-.8 0-1.6-.1-2.2-.4-.7-.3-1.2-.7-1.7-1.2s-.8-1-1-1.8a7 7 0 01-.4-2.3c0-.9 0-1.7.3-2.4s.6-1.3 1.1-1.8c.5-.5 1-.9 1.7-1.2.6-.2 1.4-.4 2.2-.4zm0 10c1 0 2-.4 2.4-1.1.6-.8.9-1.8.9-3.1s-.3-2.4-.9-3.1c-.5-.8-1.3-1.1-2.4-1.1-.6 0-1 0-1.5.3-.4.1-.7.4-1 .8-.3.3-.5.8-.6 1.3L255 83c0 .6 0 1.2.2 1.8.1.5.3 1 .6 1.3l1 .8c.4.2 1 .3 1.5.3z"/>
|
||||
<path id="i" d="M347.2 72.8h.5l.3.3 9.1 12a7.5 7.5 0 010-1.2V72.8h1.8v15.7h-1a1 1 0 01-.5 0 1 1 0 01-.3-.3l-9.1-12a14.1 14.1 0 010 1.1v11.2h-1.9V72.8h1.1zm14.6 14.5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4c.2 0 .4 0 .5.2.2 0 .3.1.5.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .3 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.4-1zm10-5v6.2h-2v-6.2l-5.9-9.5h2l.4.1.2.4 3.7 6.1a7.6 7.6 0 01.6 1.4 13 13 0 01.6-1.4l3.6-6.1.3-.4.4-.1h2l-5.9 9.5zm5.2 5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4c.1 0 .3 0 .5.2.2 0 .3.1.4.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .3 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1zm8.4-14.5V79a27.8 27.8 0 01-.3 4h-1.3a66.4 66.4 0 01-.3-4v-6.3h2zm-2.3 14.5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4c.1 0 .3 0 .5.2l.4.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .3 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1zm8.1-15.3a4.1 4.1 0 01.6 2 4 4 0 01-.5 2 6 6 0 01-1.3 1.6l-.6-.3-.2-.2.1-.3a5.1 5.1 0 00.7-1l.2-.6a2.5 2.5 0 000-1.3l-.4-.8v-.3c0-.1 0-.3.2-.4l1.2-.4zm3.3 0a4.1 4.1 0 01.7 2 4 4 0 01-.5 2 6 6 0 01-1.4 1.6l-.6-.3-.1-.2v-.3a5.1 5.1 0 00.7-1l.2-.6a2.5 2.5 0 000-1.3c0-.3-.2-.6-.4-.8v-.3c0-.1 0-.3.2-.4l1.2-.4z"/>
|
||||
<path id="j" d="M13.5 141c-.5-.7-.8-1.6-.8-2.4 0-.7.2-1.4.6-2.1.3-.7.9-1.3 1.6-1.8l.8.5.2.1v.4l-.1.2a5.7 5.7 0 00-.6.8l-.2.6-.1.6.1.8c0 .3.2.6.4 1l.1.2c0 .3-.1.4-.4.5l-1.6.7zm3.6 0c-.5-.7-.7-1.6-.7-2.4 0-.7.2-1.4.5-2.1.4-.7 1-1.3 1.6-1.8l.9.5.1.1V136a5.7 5.7 0 00-.7.8l-.2.6v1.4l.5 1v.2c0 .3 0 .4-.3.5l-1.7.7z"/>
|
||||
<path id="k" d="M60 149.4h6.3v2.4h-9.4V136h3v13.5zm12.4-9c.7 0 1.4 0 2 .3a4.3 4.3 0 012.5 2.6 6 6 0 01.4 2.7l-.1.3-.2.2h-7.3c0 1.2.4 2 1 2.6a3 3 0 002 .8c.5 0 1 0 1.2-.2l.9-.3.6-.4.5-.1h.3l.2.2.8 1-1 1a5.7 5.7 0 01-2.4.8h-1.3a6 6 0 01-2.1-.3c-.7-.3-1.3-.7-1.8-1.2s-.9-1.1-1.2-1.9a7.3 7.3 0 010-4.7c.2-.7.6-1.3 1-1.8a5 5 0 011.7-1.2c.7-.3 1.5-.4 2.3-.4zm0 1.9c-.7 0-1.3.2-1.8.7-.4.4-.7 1-.8 1.9h5v-1l-.5-.8a2 2 0 00-.8-.6l-1-.2zm10.8 9.7c-1 0-1.7-.3-2.2-.8-.6-.6-.8-1.4-.8-2.3v-6.3H79c-.1 0-.3 0-.4-.2l-.1-.4v-1l1.8-.4.6-3c0-.2 0-.3.2-.4l.4-.1h1.4v3.5h3v2h-3v6c0 .4 0 .7.2 1l.8.2h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.3-.9.6-1.4.7a5 5 0 01-1.6.3z"/>
|
||||
<path id="l" d="M183 140.3c.4 0 .9 0 1.3.2.4 0 .8.2 1.2.4h3.2v1l-.1.4c0 .1-.2.2-.5.2l-1 .2a3.5 3.5 0 01.3 1.3 3.4 3.4 0 01-1.3 2.6c-.4.4-.9.6-1.4.8a5.7 5.7 0 01-3 .1c-.3.2-.5.5-.5.7 0 .3 0 .4.3.5l.8.3h1.2a24.2 24.2 0 012.6.3l1.2.4.8.8c.2.4.3.8.3 1.4 0 .5 0 1-.3 1.4l-1 1.3-1.8.9a8.9 8.9 0 01-4.5 0c-.7-.1-1.2-.3-1.6-.6l-1-1-.2-1c0-.6.1-1 .4-1.4.4-.4.8-.7 1.4-.9-.3-.1-.5-.3-.7-.6-.2-.3-.2-.6-.2-1v-.5l.3-.6.5-.5.6-.4a3.3 3.3 0 01-1.8-3 3.4 3.4 0 011.2-2.7c.4-.3 1-.5 1.5-.7a6 6 0 011.8-.3zm3 12c0-.2-.1-.4-.3-.5 0-.2-.3-.3-.5-.3a14.7 14.7 0 00-2.8-.3l-1-.1c-.4.1-.6.3-.8.6-.2.2-.3.5-.3.8 0 .2 0 .3.2.5 0 .2.2.3.4.5l.9.2 1.2.2c.5 0 1 0 1.3-.2.4 0 .7-.1 1-.3l.5-.5.1-.6zm-3-6.4l.8-.1.7-.4.3-.6.2-.7c0-.6-.2-1-.5-1.4-.4-.3-.9-.5-1.5-.5-.7 0-1.2.2-1.5.5-.4.4-.5.8-.5 1.4v.7a1.6 1.6 0 001 1l1 .1zm12.3-5.5c.8 0 1.6 0 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.4 4.9 4.9 0 01-2.9 3 6.2 6.2 0 01-4.6 0 5 5 0 01-2.8-3c-.3-.7-.4-1.6-.4-2.4 0-1 0-1.7.4-2.5.2-.7.6-1.3 1-1.8a5 5 0 011.9-1.1c.6-.3 1.4-.4 2.3-.4zm0 9.5c.9 0 1.6-.3 2-1 .5-.6.7-1.5.7-2.7 0-1.2-.2-2.2-.7-2.8-.4-.6-1.1-1-2-1-1 0-1.7.4-2.2 1-.4.6-.6 1.6-.6 2.8 0 1.2.2 2.1.6 2.7.5.7 1.2 1 2.2 1z"/>
|
||||
<path id="m" d="M251.3 152c-1 0-1.7-.3-2.2-.8-.6-.6-.8-1.4-.8-2.3v-6.3H247c-.1 0-.2 0-.3-.2l-.2-.4v-1l1.8-.4.6-3c0-.2 0-.3.2-.4l.4-.1h1.4v3.5h3v2h-3v6c0 .4 0 .7.3 1l.7.2h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.3-.9.6-1.4.7a5 5 0 01-1.6.3zm9.7-11.6c.8 0 1.6 0 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.4 4.9 4.9 0 01-2.9 3 6.2 6.2 0 01-4.6 0 5 5 0 01-2.8-3c-.3-.7-.4-1.6-.4-2.4 0-1 0-1.7.4-2.5.2-.7.6-1.3 1-1.8a5 5 0 011.9-1.1c.6-.3 1.4-.4 2.3-.4zm0 9.5c.9 0 1.6-.3 2-1 .5-.6.7-1.5.7-2.7 0-1.2-.2-2.2-.7-2.8-.4-.6-1.1-1-2-1-1 0-1.7.4-2.2 1-.4.6-.6 1.6-.6 2.8 0 1.2.2 2.1.6 2.7.5.7 1.2 1 2.2 1z"/>
|
||||
<path id="n" d="M347.7 136l.5.1.3.3 9.1 11.9a7.5 7.5 0 010-1V136h1.8v15.7h-1a1 1 0 01-.5 0 1 1 0 01-.3-.4l-9.1-11.8a14.1 14.1 0 010 1v11.2h-1.9v-15.7h1.1zm14.6 14.6a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1.5.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.4-1zm10-5v6.2h-2v-6.3l-5.9-9.4h2l.4.1.2.4 3.7 6a7.6 7.6 0 01.6 1.5 13 13 0 01.6-1.4l3.6-6.1c0-.2.2-.3.3-.4l.4-.1h2l-5.9 9.4zm5.2 5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1.4.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1zm8.4-14.5v6.2a27.8 27.8 0 01-.3 4h-1.3a66.4 66.4 0 01-.3-4v-6.2h2zm-2.3 14.5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1.4.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1zm8.1-15.4a4.1 4.1 0 01.6 2.1 4 4 0 01-.5 2 6 6 0 01-1.3 1.6l-.6-.4h-.2v-.2l.1-.3a5.1 5.1 0 00.7-1l.2-.5a2.5 2.5 0 000-1.4l-.4-.8v-.2c0-.2 0-.3.2-.4l1.2-.5zm3.3 0a4.1 4.1 0 01.7 2.1 4 4 0 01-.5 2 6 6 0 01-1.4 1.6l-.6-.4h-.1v-.2-.3a5.1 5.1 0 00.7-1l.2-.5a2.5 2.5 0 000-1.4l-.4-.8v-.2c0-.2 0-.3.2-.4l1.2-.5z"/>
|
||||
<path id="o" d="M127 135l.6 1.2a4.3 4.3 0 01-.4 3.3c-.4.7-.9 1.3-1.6 1.9l-.8-.5-.2-.2v-.2l.1-.3a6.5 6.5 0 00.6-.9 2.9 2.9 0 00.3-1.2l-.1-.8c0-.3-.2-.6-.4-.9l-.1-.3c0-.2.1-.4.4-.5l1.6-.6zm9.7 7.7l-.2.3h-.3a29 29 0 00-1.6-.6h-1a2 2 0 00-1.2.3 1 1 0 00-.5.9l.3.6.6.4.9.3a29 29 0 012 .8l.9.5.6.9c.2.3.3.7.3 1.1 0 .6-.1 1-.3 1.5-.2.5-.5.9-.9 1.2a4 4 0 01-1.4.8 6.1 6.1 0 01-3 .2 6.7 6.7 0 01-2.1-.7l-.8-.6.7-1c0-.2.1-.2.3-.3l.4-.1.4.1a10.6 10.6 0 001.3.6l1 .2.8-.1.6-.3.3-.5.1-.5c0-.2 0-.5-.2-.6a2 2 0 00-.6-.5 29.4 29.4 0 01-3-1l-.9-.6-.6-1c-.2-.3-.2-.7-.2-1.2a3.3 3.3 0 011-2.4 4 4 0 011.4-.8c.5-.2 1.1-.2 1.8-.2a4.8 4.8 0 013.7 1.4l-.6 1z"/>
|
||||
<path id="p" d="M13.5 208.7c-.5-.8-.8-1.6-.8-2.5 0-.7.2-1.4.6-2 .3-.7.9-1.3 1.6-2l.8.6.2.2V203.3l-.1.2a5.7 5.7 0 00-.6.9l-.2.5-.1.7.1.8c0 .3.2.6.4.9l.1.3c0 .2-.1.4-.4.5l-1.6.6zm3.6 0c-.5-.8-.7-1.6-.7-2.5 0-.7.2-1.4.5-2 .4-.7 1-1.3 1.6-2l.9.6.1.2v.5a5.7 5.7 0 00-.7.9l-.2.5v1.5l.5.9v.3c0 .2 0 .4-.3.5l-1.7.6z"/>
|
||||
<path id="q" d="M60 217h6.3v2.5h-9.4v-16h3V217zm12.4-9c.7 0 1.4.1 2 .3a4.3 4.3 0 012.5 2.6 6 6 0 01.4 2.7l-.1.3-.2.2h-7.3c0 1.2.4 2 1 2.6a3 3 0 002 .8l1.2-.1.9-.4.6-.3.5-.2h.3l.2.3.8 1-1 .9a5.7 5.7 0 01-2.4.8l-1.3.1a6 6 0 01-2.1-.4c-.7-.2-1.3-.6-1.8-1.1-.5-.5-.9-1.2-1.2-2a7.3 7.3 0 010-4.7c.2-.7.6-1.3 1-1.8a5 5 0 011.7-1.2c.7-.3 1.5-.4 2.3-.4zm0 2c-.7 0-1.3.2-1.8.6-.4.5-.7 1-.8 2h5v-1l-.5-.9a2 2 0 00-.8-.6l-1-.2zm10.8 9.6c-1 0-1.7-.2-2.2-.8-.6-.6-.8-1.3-.8-2.3v-6.3H79l-.4-.1-.1-.5v-1l1.8-.3.6-3.1c0-.2 0-.3.2-.4H82.9v3.5h3v1.9h-3v6.1c0 .4 0 .6.2.8.2.2.5.3.8.3h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.4-.9.6-1.4.8a5 5 0 01-1.6.2z"/>
|
||||
<path id="r" d="M185.5 208l1.3.1 1.2.5h3.2v1l-.1.4-.5.2-1 .1a3.5 3.5 0 01.3 1.3 3.4 3.4 0 01-1.3 2.7l-1.4.7a5.7 5.7 0 01-3 .2c-.3.2-.5.4-.5.7 0 .2 0 .4.3.5l.8.2h1.2a24.2 24.2 0 012.6.3l1.2.5c.3.2.6.4.8.8.2.3.3.8.3 1.3s0 1-.3 1.5l-1 1.2c-.6.4-1.1.7-1.8.9a8.9 8.9 0 01-4.5 0c-.7 0-1.2-.3-1.6-.6l-1-1-.2-1c0-.6.1-1 .4-1.4.4-.3.8-.6 1.4-.8l-.7-.7c-.2-.2-.2-.6-.2-1v-.5l.3-.5.5-.5.6-.4c-.6-.3-1-.8-1.3-1.3-.4-.5-.5-1-.5-1.8a3.4 3.4 0 011.2-2.6c.4-.4 1-.6 1.5-.8a6 6 0 011.8-.2zm3 12c0-.3-.1-.4-.3-.6l-.5-.3a14.7 14.7 0 00-2.8-.3l-1-.1-.8.6c-.2.2-.3.5-.3.8 0 .2 0 .4.2.5 0 .2.2.4.4.5l.9.3h2.5l1-.3.5-.5.1-.6zm-3-6.5l.8-.1c.3 0 .5-.2.7-.4l.3-.6.2-.7c0-.6-.2-1-.5-1.3-.4-.4-.9-.5-1.5-.5-.7 0-1.2.1-1.5.5-.4.3-.5.7-.5 1.3v.7a1.6 1.6 0 001 1l1 .1zm12.3-5.5c.8 0 1.6.1 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.4 4.9 4.9 0 01-2.9 3c-.6.3-1.4.4-2.2.4-.9 0-1.7-.1-2.3-.4a5 5 0 01-3-3c-.2-.7-.3-1.5-.3-2.4 0-.9 0-1.7.4-2.4.2-.7.6-1.3 1-1.8a5 5 0 011.9-1.2c.6-.3 1.4-.4 2.3-.4zm0 9.5c.9 0 1.6-.3 2-1 .5-.5.7-1.5.7-2.7 0-1.2-.2-2.1-.7-2.8-.4-.6-1.1-1-2-1-1 0-1.7.4-2.2 1-.4.7-.6 1.6-.6 2.8 0 1.2.2 2.1.6 2.8.5.6 1.2 1 2.2 1z"/>
|
||||
<path id="s" d="M252.8 219.6c-1 0-1.7-.2-2.2-.8-.6-.6-.8-1.3-.8-2.3v-6.3h-1.2l-.3-.1-.2-.5v-1l1.8-.3.6-3.1c0-.2 0-.3.2-.4H252.5v3.5h3v1.9h-3v6.1c0 .4 0 .6.3.8.1.2.4.3.7.3h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.4-.9.6-1.4.8a5 5 0 01-1.6.2zm9.7-11.6c.8 0 1.6.1 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.4 4.9 4.9 0 01-2.9 3c-.6.3-1.4.4-2.2.4-.9 0-1.7-.1-2.3-.4a5 5 0 01-3-3c-.2-.7-.3-1.5-.3-2.4 0-.9 0-1.7.4-2.4.2-.7.6-1.3 1-1.8a5 5 0 011.9-1.2c.6-.3 1.4-.4 2.3-.4zm0 9.5c.9 0 1.6-.3 2-1 .5-.5.7-1.5.7-2.7 0-1.2-.2-2.1-.7-2.8-.4-.6-1.1-1-2-1-1 0-1.7.4-2.2 1-.4.7-.6 1.6-.6 2.8 0 1.2.2 2.1.6 2.8.5.6 1.2 1 2.2 1z"/>
|
||||
<path id="t" d="M331.8 203.7h.4l.3.4 9.2 11.8a7.5 7.5 0 01-.1-1v-11.2h1.9v15.8h-1.1a1 1 0 01-.4-.1 1 1 0 01-.4-.3l-9-11.9a14.1 14.1 0 010 1v11.3h-2v-15.8h1.2zm14.6 14.5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.3h.5l.4.4a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .3 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1zm10-5v6.3h-2.1v-6.3l-5.8-9.5h1.9l.4.1.3.4 3.6 6.1a7.6 7.6 0 01.6 1.4 13 13 0 01.7-1.4l3.6-6.1.2-.3c.1-.2.3-.2.5-.2h1.9l-5.8 9.5zm5.1 5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.3h.5l.5.4a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .3 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.4-1zm8.5-14.5v6.3a27.8 27.8 0 01-.3 4h-1.3a66.4 66.4 0 01-.3-4v-6.3h1.9zm-2.4 14.5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.3h.5l.5.4a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .3 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.4-1z"/>
|
||||
<path id="u" d="M127 202.6l.6 1.2a4.3 4.3 0 01-.4 3.4c-.4.6-.9 1.3-1.6 1.8l-.8-.5-.2-.2v-.1l.1-.4a6.5 6.5 0 00.6-.8 2.9 2.9 0 00.3-1.3l-.1-.8c0-.3-.2-.6-.4-.9l-.1-.3c0-.2.1-.4.4-.5l1.6-.6zm9.7 7.8l-.2.2h-.3a29 29 0 00-1.6-.6h-1a2 2 0 00-1.2.3 1 1 0 00-.5.9c0 .2.1.5.3.6.1.2.3.3.6.4l.9.4a29 29 0 012 .7l.9.6c.3.2.5.5.6.8.2.3.3.7.3 1.2l-.3 1.5-.9 1.2a4 4 0 01-1.4.8 6.1 6.1 0 01-3 .1 6.7 6.7 0 01-2.1-.7l-.8-.6.7-1 .3-.3h.8a10.6 10.6 0 001.3.7l1 .1h.8l.6-.4.3-.4.1-.5c0-.3 0-.5-.2-.7a2 2 0 00-.6-.4 29.4 29.4 0 01-3-1l-.9-.7-.6-.9c-.2-.3-.2-.8-.2-1.3a3.3 3.3 0 011-2.4 4 4 0 011.4-.7 5.6 5.6 0 014 .1c.6.2 1.1.6 1.5 1l-.6 1z"/>
|
||||
<path id="v" d="M429.6 202.6l.5 1.2a4.3 4.3 0 01-.3 3.4c-.4.6-1 1.3-1.6 1.8l-.9-.5-.1-.2v-.1-.4a7.8 7.8 0 00.7-.8l.2-.6a2.5 2.5 0 000-1.5l-.5-.9v-.3c0-.2 0-.4.3-.5l1.7-.6zm3.6 0l.6 1.2a4.3 4.3 0 01-.4 3.4c-.3.6-.9 1.3-1.6 1.8l-.8-.5-.2-.2v-.1l.1-.4a7.8 7.8 0 00.6-.8l.2-.6a2.5 2.5 0 000-1.5c0-.3-.2-.6-.4-.9l-.1-.3c0-.2.1-.4.4-.5l1.6-.6z"/>
|
||||
<path id="w" d="M13.5 275.3c-.5-.9-.8-1.7-.8-2.5 0-.7.2-1.4.6-2.1.3-.7.9-1.3 1.6-1.9l.8.6.2.1v.4l-.1.1a5.7 5.7 0 00-.6.9l-.2.6-.1.6.1.8c0 .3.2.6.4 1l.1.2c0 .3-.1.4-.4.5l-1.6.7zm3.6 0c-.5-.9-.7-1.7-.7-2.5 0-.7.2-1.4.5-2.1.4-.7 1-1.3 1.6-1.9l.9.6.1.1v.5a5.7 5.7 0 00-.7.9l-.2.6v1.4l.5 1v.2c0 .3 0 .4-.3.5l-1.7.7z"/>
|
||||
<path id="x" d="M58.4 283.6h6.4v2.4h-9.4v-16h3v13.6zm12.5-9c.7 0 1.4 0 2 .3a4.3 4.3 0 012.5 2.6 6 6 0 01.4 2.7l-.1.3-.2.1-.3.1h-7c0 1.2.4 2 1 2.6a3 3 0 002 .8c.5 0 1 0 1.2-.2l.9-.3.6-.4.5-.1h.3l.2.2.8 1c-.3.4-.6.7-1 .9a5.7 5.7 0 01-2.4.9H71a6 6 0 01-2.1-.3c-.7-.3-1.3-.7-1.8-1.2s-.9-1.1-1.2-1.9a7.3 7.3 0 010-4.8c.2-.6.6-1.2 1-1.7a5 5 0 011.7-1.2c.7-.3 1.5-.5 2.3-.5zm0 1.9c-.7 0-1.3.2-1.8.7-.4.4-.7 1-.8 1.9h5v-1l-.5-.9a2 2 0 00-.8-.5l-1-.2zm10.8 9.7c-1 0-1.7-.3-2.2-.9-.6-.5-.8-1.3-.8-2.2v-6.4H77l-.1-.5V275l1.8-.3.6-3c0-.2 0-.3.2-.4l.4-.1h1.4v3.5h3v2h-3v6c0 .4 0 .7.2.9.2.2.5.3.8.3h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.3-.9.6-1.4.7a5 5 0 01-1.6.3z"/>
|
||||
<path id="y" d="M183 274.5c.4 0 .9 0 1.3.2.4 0 .8.2 1.2.4h3.2v1l-.1.4c0 .1-.2.2-.5.2l-1 .2a3.5 3.5 0 01.3 1.3 3.4 3.4 0 01-1.3 2.6l-1.4.8a5.7 5.7 0 01-3 .1c-.3.2-.5.4-.5.7 0 .2 0 .4.3.5l.8.2 1.2.1a24.2 24.2 0 012.6.3c.5 0 .9.2 1.2.4l.8.8c.2.4.3.8.3 1.3s0 1-.3 1.5l-1 1.3c-.6.3-1.1.6-1.8.8-.7.3-1.4.4-2.3.4-.9 0-1.6-.1-2.2-.3-.7-.1-1.2-.4-1.6-.6l-1-1-.2-1.1c0-.5.1-1 .4-1.3.4-.4.8-.7 1.4-.9l-.7-.6c-.2-.3-.2-.6-.2-1v-.5l.3-.6.5-.5.6-.4a3.3 3.3 0 01-1.8-3 3.4 3.4 0 011.2-2.7 6 6 0 013.2-1zm3 12c0-.2-.1-.4-.3-.5 0-.2-.3-.3-.5-.4a14.7 14.7 0 00-2.8-.3h-1c-.4.1-.6.3-.8.5-.2.3-.3.5-.3.8 0 .2 0 .4.2.6 0 .2.2.3.4.4.2.2.5.3.9.3l1.2.1h1.3l1-.4.5-.5.1-.6zm-3-6.4c.3 0 .6 0 .8-.2.3 0 .5-.2.7-.3l.3-.6.2-.8c0-.5-.2-1-.5-1.3-.4-.3-.9-.5-1.5-.5-.7 0-1.2.2-1.5.5-.4.3-.5.8-.5 1.3v.8a1.6 1.6 0 001 1h1zm12.3-5.6c.8 0 1.6.2 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.5 4.9 4.9 0 01-2.9 3 6.2 6.2 0 01-4.6 0 5 5 0 01-2.8-3c-.3-.8-.4-1.6-.4-2.5 0-.9 0-1.7.4-2.4.2-.7.6-1.3 1-1.8a5 5 0 011.9-1.2c.6-.2 1.4-.4 2.3-.4zm0 9.6c.9 0 1.6-.3 2-1 .5-.6.7-1.5.7-2.7 0-1.3-.2-2.2-.7-2.8-.4-.7-1.1-1-2-1-1 0-1.7.3-2.2 1-.4.6-.6 1.5-.6 2.8 0 1.2.2 2 .6 2.7.5.7 1.2 1 2.2 1z"/>
|
||||
<path id="z" d="M251.3 286.2c-1 0-1.7-.3-2.2-.9-.6-.5-.8-1.3-.8-2.2v-6.4h-1.5l-.2-.5V275l1.8-.3.6-3c0-.2 0-.3.2-.4l.4-.1h1.4v3.5h3v2h-3v6c0 .4 0 .7.3.9.1.2.4.3.7.3h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.3-.9.6-1.4.7a5 5 0 01-1.6.3zm9.7-11.7c.8 0 1.6.2 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.5 4.9 4.9 0 01-2.9 3 6.2 6.2 0 01-4.6 0 5 5 0 01-2.8-3c-.3-.8-.4-1.6-.4-2.5 0-.9 0-1.7.4-2.4.2-.7.6-1.3 1-1.8a5 5 0 011.9-1.2c.6-.2 1.4-.4 2.3-.4zm0 9.6c.9 0 1.6-.3 2-1 .5-.6.7-1.5.7-2.7 0-1.3-.2-2.2-.7-2.8-.4-.7-1.1-1-2-1-1 0-1.7.3-2.2 1-.4.6-.6 1.5-.6 2.8 0 1.2.2 2 .6 2.7.5.7 1.2 1 2.2 1z"/>
|
||||
<path id="A" d="M310.4 270.2l.4.1.4.3 9 11.9a7.5 7.5 0 010-1.1v-11.2h2V286H321a1 1 0 01-.4 0 1 1 0 01-.3-.4l-9.2-11.9a14.1 14.1 0 010 1V286h-1.8v-15.8h1.1zm14.6 14.6a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1c.2 0 .3.2.5.3a1.3 1.3 0 01.3 1 1.4 1.4 0 01-.3 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.4-1zm10-5v6.2H333v-6.3l-5.8-9.5h1.9c.2 0 .3 0 .4.2.2 0 .2.2.3.3l3.6 6.2a7.6 7.6 0 01.7 1.4 13 13 0 01.6-1.4l3.6-6.2.3-.3.4-.2h2l-5.9 9.5zm5.2 5a1.4 1.4 0 01.4-1 1.3 1.3 0 011-.4l.5.1.4.3a1.3 1.3 0 01.4 1 1.4 1.4 0 01-.4 1 1.3 1.3 0 01-1 .4 1.4 1.4 0 01-1-.4 1.4 1.4 0 01-.3-1z"/>
|
||||
<path id="B" d="M126.5 269.1l.6 1.3a4.3 4.3 0 01-.4 3.3c-.4.7-.9 1.3-1.6 1.9l-.8-.6-.2-.1v-.2l.1-.3a6.5 6.5 0 00.6-.9 2.9 2.9 0 00.3-1.2l-.1-.8c0-.3-.2-.6-.4-1l-.1-.2c0-.3.1-.4.4-.5l1.6-.7zm9.7 7.8l-.2.3h-.3a29 29 0 00-1.6-.6h-1a2 2 0 00-1.2.3 1 1 0 00-.5.9l.3.6.6.4.9.3a29 29 0 012 .8l.9.5c.3.3.5.5.6.9.2.3.3.7.3 1.1 0 .6-.1 1-.3 1.5-.2.5-.5.9-.9 1.2a4 4 0 01-1.4.8 6.1 6.1 0 01-3 .2 6.7 6.7 0 01-2.1-.8l-.8-.5.7-1c0-.2.1-.3.3-.3l.4-.1.4.1a10.6 10.6 0 001.3.6l1 .2.8-.1.6-.3.3-.5.1-.5c0-.3 0-.5-.2-.6a2 2 0 00-.6-.5 29.4 29.4 0 01-3-1l-.9-.7-.6-.8c-.2-.4-.2-.8-.2-1.3a3.3 3.3 0 011-2.4 4 4 0 011.4-.8 5.6 5.6 0 014 .1c.6.3 1.1.6 1.5 1l-.6 1z"/>
|
||||
<path id="C" d="M429.6 269.1l.5 1.3a4.3 4.3 0 01-.3 3.3c-.4.7-1 1.3-1.6 1.9l-.9-.6-.1-.1v-.2-.3a7.8 7.8 0 00.7-.9l.2-.6a2.5 2.5 0 000-1.4l-.5-1v-.2c0-.3 0-.4.3-.5l1.7-.7zm3.6 0l.6 1.3a4.3 4.3 0 01-.4 3.3c-.3.7-.9 1.3-1.6 1.9l-.8-.6-.2-.1v-.2l.1-.3a7.8 7.8 0 00.6-.9l.2-.6a2.5 2.5 0 000-1.4c0-.3-.2-.6-.4-1l-.1-.2c0-.3.1-.4.4-.5l1.6-.7z"/>
|
||||
<path id="D" d="M387.8 270v6.4a19.2 19.2 0 01-.3 4h-1.9a41.8 41.8 0 01-.3-4V270h2.5zm-3 14.5a1.7 1.7 0 01.5-1.2 1.7 1.7 0 011.2-.5 1.6 1.6 0 011.2.5 1.7 1.7 0 01.3 1.9c0 .2-.2.3-.3.5l-.5.3-.7.2a1.7 1.7 0 01-1.2-.5l-.3-.5-.2-.7z"/>
|
||||
<path id="E" d="M13.5 341.8c-.5-.8-.8-1.6-.8-2.5 0-.7.2-1.4.6-2 .3-.7.9-1.4 1.6-2l.8.6.2.2V336.4l-.1.2a5.7 5.7 0 00-.6.8l-.2.6-.1.7.1.8c0 .3.2.5.4.9l.1.3c0 .2-.1.4-.4.5l-1.6.6zm3.6 0c-.5-.8-.7-1.6-.7-2.5 0-.7.2-1.4.5-2 .4-.7 1-1.4 1.6-2l.9.6.1.2v.5a5.7 5.7 0 00-.7.8l-.2.6v1.5l.5.9v.3c0 .2 0 .4-.3.5l-1.7.6z"/>
|
||||
<path id="F" d="M60 350.1h6.3v2.4h-9.4v-15.9h3v13.5zm12.4-9c.7 0 1.4.1 2 .3a4.3 4.3 0 012.5 2.6 6 6 0 01.4 2.7l-.1.3-.2.2h-7.3c0 1.2.4 2 1 2.6a3 3 0 002 .8l1.2-.1.9-.4.6-.3.5-.2h.3l.2.3.8 1-1 .8a5.7 5.7 0 01-2.4 1h-1.3a6 6 0 01-2.1-.4c-.7-.2-1.3-.6-1.8-1.1-.5-.5-.9-1.2-1.2-2a7.3 7.3 0 010-4.7c.2-.7.6-1.3 1-1.8a5 5 0 011.7-1.2c.7-.3 1.5-.4 2.3-.4zm0 2c-.7 0-1.3.2-1.8.6-.4.4-.7 1-.8 1.9h5v-1l-.5-.8a2 2 0 00-.8-.6l-1-.2zm10.8 9.6c-1 0-1.7-.3-2.2-.8-.6-.6-.8-1.3-.8-2.3v-6.3H79l-.4-.1-.1-.5v-1l1.8-.4.6-3c0-.2 0-.3.2-.4H82.9v3.5h3v1.9h-3v6.1c0 .4 0 .6.2.8.2.2.5.3.8.3h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.4-.9.6-1.4.8a5 5 0 01-1.6.2z"/>
|
||||
<path id="G" d="M184 341l1.3.2 1.2.4h3.2v1l-.1.5-.5.2-1 .1a3.5 3.5 0 01.3 1.3 3.4 3.4 0 01-1.3 2.7l-1.4.7a5.7 5.7 0 01-3 .1c-.3.3-.5.5-.5.8 0 .2 0 .4.3.5l.8.2h1.2a24.2 24.2 0 012.6.3l1.2.5c.3.2.6.4.8.8.2.3.3.8.3 1.3s0 1-.3 1.4c-.3.5-.6 1-1 1.3-.6.4-1.1.7-1.8.9a8.9 8.9 0 01-4.5 0c-.7-.1-1.2-.3-1.6-.6l-1-1-.2-1c0-.6.1-1 .4-1.4.4-.4.8-.6 1.4-.9-.3-.1-.5-.3-.7-.6-.2-.2-.2-.6-.2-1v-.5l.3-.5.5-.5.6-.5a3.3 3.3 0 01-1.8-3 3.4 3.4 0 011.2-2.7c.4-.3 1-.5 1.5-.7a6 6 0 011.8-.2zm3 12c0-.2-.1-.3-.3-.5l-.5-.3a14.7 14.7 0 00-2.8-.3l-1-.1-.8.6c-.2.2-.3.5-.3.8 0 .2 0 .4.2.5 0 .2.2.4.4.5l.9.3h2.5l1-.4c.2 0 .4-.3.5-.4l.1-.6zm-3-6.4l.8-.1.7-.4.3-.6.2-.7c0-.6-.2-1-.5-1.3-.4-.4-.9-.5-1.5-.5-.7 0-1.2.1-1.5.5-.4.3-.5.7-.5 1.3v.7a1.6 1.6 0 001 1l1 .1zm12.3-5.5c.8 0 1.6.1 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.4 4.9 4.9 0 01-2.9 3c-.6.3-1.4.4-2.2.4-.9 0-1.7-.1-2.3-.4a5 5 0 01-3-3c-.2-.7-.3-1.5-.3-2.4 0-1 0-1.7.4-2.4.2-.7.6-1.4 1-1.9a5 5 0 011.9-1.1c.6-.3 1.4-.4 2.3-.4zm0 9.5c.9 0 1.6-.3 2-1 .5-.6.7-1.5.7-2.7 0-1.2-.2-2.1-.7-2.8-.4-.6-1.1-1-2-1-1 0-1.7.4-2.2 1-.4.7-.6 1.6-.6 2.8 0 1.2.2 2.1.6 2.8.5.6 1.2 1 2.2 1z"/>
|
||||
<path id="H" d="M249.8 352.7c-1 0-1.7-.3-2.2-.8-.6-.6-.8-1.3-.8-2.3v-6.3h-1.2l-.3-.1-.2-.5v-1l1.8-.4.6-3c0-.2 0-.3.2-.4H249.5v3.5h3v1.9h-3v6.1c0 .4 0 .6.3.8.1.2.4.3.7.3h.4a2.3 2.3 0 00.5-.3h.4l.2.2.8 1.3c-.4.4-.9.6-1.4.8a5 5 0 01-1.6.2zm9.7-11.6c.8 0 1.6.1 2.2.4a5 5 0 013 3c.2.7.3 1.5.3 2.4a7 7 0 01-.4 2.4 4.9 4.9 0 01-2.9 3c-.6.3-1.4.4-2.2.4-.9 0-1.7-.1-2.3-.4a5 5 0 01-3-3c-.2-.7-.3-1.5-.3-2.4 0-1 0-1.7.4-2.4.2-.7.6-1.4 1-1.9a5 5 0 011.9-1.1c.6-.3 1.4-.4 2.3-.4zm0 9.5c.9 0 1.6-.3 2-1 .5-.6.7-1.5.7-2.7 0-1.2-.2-2.1-.7-2.8-.4-.6-1.1-1-2-1-1 0-1.7.4-2.2 1-.4.7-.6 1.6-.6 2.8 0 1.2.2 2.1.6 2.8.5.6 1.2 1 2.2 1z"/>
|
||||
<path id="I" d="M312.3 336.6h.4l.2.1.2.2.2.2 8.4 10.6a11 11 0 010-1.4v-9.7h2.5v16h-1.5c-.2 0-.4 0-.6-.2-.2 0-.3-.2-.4-.4l-8.4-10.6a15.3 15.3 0 01.1 1.4v9.7h-2.6v-15.9h1.5zm14.3 14.4a1.7 1.7 0 01.5-1.1 1.7 1.7 0 011.2-.5 1.6 1.6 0 011.2.5 1.7 1.7 0 01.3 1.8c0 .2-.2.4-.3.5l-.6.4-.6.1a1.7 1.7 0 01-1.2-.5c-.1-.1-.3-.3-.3-.5l-.2-.7zm11-4.6v6.1h-3v-6.1l-5.7-9.8h2.6l.6.2.4.5 2.9 5.3a13.3 13.3 0 01.8 1.7 12 12 0 01.7-1.7l2.9-5.3c0-.2.2-.3.4-.5l.6-.2h2.6l-5.8 9.8zm4.7 4.6a1.7 1.7 0 01.5-1.1 1.7 1.7 0 011.2-.5 1.6 1.6 0 011.1.5 1.7 1.7 0 01.4 1.8c0 .2-.2.4-.4.5l-.5.4-.6.1a1.7 1.7 0 01-1.2-.5l-.4-.5-.1-.7z"/>
|
||||
<path id="J" d="M128 335.7l.6 1.2a4.3 4.3 0 01-.4 3.4c-.4.6-.9 1.2-1.6 1.8l-.8-.5-.2-.2v-.2l.1-.3a6.5 6.5 0 00.6-.9 2.9 2.9 0 00.3-1.2l-.1-.8c0-.3-.2-.6-.4-.9l-.1-.3c0-.2.1-.4.4-.5l1.6-.6zm9.7 7.8l-.2.2h-.3a29 29 0 00-1.6-.6h-1a2 2 0 00-1.2.3 1 1 0 00-.5.9l.3.6c.1.2.3.3.6.4l.9.4a29 29 0 012 .7l.9.6c.3.2.5.5.6.8.2.3.3.7.3 1.2l-.3 1.5-.9 1.2a4 4 0 01-1.4.7 6.1 6.1 0 01-3 .2 6.7 6.7 0 01-2.1-.7l-.8-.6.7-1 .3-.3h.8a10.6 10.6 0 001.3.7l1 .1.8-.1.6-.3.3-.4.1-.5c0-.3 0-.5-.2-.7a2 2 0 00-.6-.4 29.4 29.4 0 01-3-1l-.9-.7-.6-.9c-.2-.3-.2-.8-.2-1.3a3.3 3.3 0 011-2.4 4 4 0 011.4-.7 5.6 5.6 0 014 .1c.6.2 1.1.6 1.5 1l-.6 1z"/>
|
||||
<path id="K" d="M429.6 335.7l.5 1.2a4.3 4.3 0 01-.3 3.4l-1.6 1.8-.9-.5-.1-.2v-.2-.3a7.8 7.8 0 00.7-.9l.2-.5a2.5 2.5 0 000-1.5l-.5-.9v-.3c0-.2 0-.4.3-.5l1.7-.6zm3.6 0l.6 1.2a4.3 4.3 0 01-.4 3.4c-.3.6-.9 1.2-1.6 1.8l-.8-.5-.2-.2v-.2l.1-.3a7.8 7.8 0 00.6-.9l.2-.5a2.5 2.5 0 000-1.5c0-.3-.2-.6-.4-.9l-.1-.3c0-.2.1-.4.4-.5l1.6-.6z"/>
|
||||
<path id="L" d="M387.8 336.6v6.3a19.2 19.2 0 01-.3 4h-1.9a41.8 41.8 0 01-.3-4v-6.3h2.5zm-3 14.4a1.7 1.7 0 01.5-1.1 1.7 1.7 0 011.2-.5 1.6 1.6 0 011.2.5 1.7 1.7 0 01.3 1.8c0 .2-.2.4-.3.5l-.5.4-.7.1a1.7 1.7 0 01-1.2-.5l-.3-.5-.2-.7z"/>
|
||||
<path id="M" d="M16.4 11.3V15H14V4h3.9c.7 0 1.4.2 2 .3l1.3.8c.4.3.6.7.8 1.1l.3 1.4c0 .6-.1 1-.3 1.5a3 3 0 01-.8 1.2c-.4.3-.8.6-1.4.8-.5.2-1.2.2-2 .2h-1.3zm0-1.9h1.4c.6 0 1-.1 1.4-.4.3-.4.4-.8.4-1.4l-.1-.6a1.4 1.4 0 00-1-1H16.5v3.4zM26 11v4h-2.5V4H27c.8 0 1.5.2 2 .3l1.4.7c.4.3.6.6.8 1l.2 1.3-.1 1a3 3 0 01-1.1 1.6l-1 .5.5.3.4.5 2.3 3.8h-2.3c-.4 0-.7-.2-.9-.5l-1.8-3.2-.3-.3a1 1 0 00-.5 0H26zm0-1.8h1l1-.1.5-.4c.2-.1.3-.3.3-.5l.1-.7c0-.5-.1-.9-.4-1.1-.3-.3-.8-.4-1.5-.4h-1v3.2zm14.5-5.1v2H36v2.5h3.4v1.8H36v2.7h4.5V15h-7V4h7zM49 4v2h-4.5v2.7h3.7v2h-3.7V15h-2.6V4h7zM53 15h-2.5V4H53v11zm4.8-5.6L54.4 4h2.9l.2.3L59.7 8v-.2l.2-.1 1.9-3.3c0-.2.3-.3.5-.3h2.4l-3.4 5.2 3.5 5.7h-2.6l-.4-.1a1 1 0 01-.2-.3l-2.2-3.8-.1.3-2 3.5-.3.3-.4.1h-2.4l3.6-5.6z"/>
|
||||
<path id="N" d="M20 136.3l-.2.3h-.4-.3a67.9 67.9 0 00-1-.5l-.8-.2c-.5 0-.8.1-1 .4a1 1 0 00-.4.8c0 .2 0 .4.2.5.1.2.3.3.6.4l.7.3a19.6 19.6 0 011.8.7l.8.5a2.6 2.6 0 01.8 2c0 .5-.1 1-.3 1.4a3.3 3.3 0 01-2 2l-1.6.2a5.3 5.3 0 01-2.1-.4 6 6 0 01-1-.4 4 4 0 01-.7-.6l.8-1.2s0-.2.2-.2l.3-.1.5.1a32.8 32.8 0 001.1.7l1 .1c.4 0 .7 0 1-.3.3-.2.4-.5.4-1 0-.2 0-.4-.2-.6l-.6-.4a23.2 23.2 0 01-2.6-.9l-.7-.5-.6-1a3.5 3.5 0 010-2.4l.8-1c.3-.3.7-.6 1.2-.8.4-.2 1-.2 1.6-.2a6 6 0 011.9.3 5 5 0 011.4.8l-.6 1.2zm6.6 6.7c.3 0 .6 0 .9-.2.3 0 .5-.2.7-.5l.4-.7.1-1V134h2.6v6.4a5 5 0 01-.4 1.9 4.1 4.1 0 01-2.4 2.4c-.5.2-1.2.3-2 .3-.7 0-1.3 0-1.9-.3-.6-.2-1-.6-1.5-1-.4-.4-.7-.8-.9-1.4-.2-.6-.3-1.2-.3-1.9v-6.4h2.5v6.4c0 .4 0 .8.2 1 0 .4.2.6.4.8.2.3.4.4.7.5l.9.2zm13.4-9v2h-4.5v2.8h3.7v2h-3.7v4.2h-2.6v-11h7zm8.3 0v2h-4.5v2.8h3.8v2h-3.8v4.2h-2.5v-11h7zm4.1 11H50v-11h2.5v11zm4.7-5.6l-3.4-5.3h2.9l.2.3L59 138l.1-.2.1-.1 2-3.3c0-.2.2-.3.4-.3h2.5l-3.5 5.2 3.5 5.7h-2.5l-.4-.1a1 1 0 01-.2-.3l-2.2-3.8-.2.3-2 3.5-.3.3-.3.1h-2.4l3.5-5.6z"/>
|
||||
<path id="O" d="M20 201.3l-.2.3h-.4-.3a67.9 67.9 0 00-1-.5l-.8-.2c-.5 0-.8.1-1 .4a1 1 0 00-.4.8c0 .2 0 .4.2.5.1.2.3.3.6.4l.7.3a19.6 19.6 0 011.8.7l.8.5a2.6 2.6 0 01.8 2c0 .5-.1 1-.3 1.4a3.3 3.3 0 01-2 2l-1.6.2a5.3 5.3 0 01-2.1-.4 6 6 0 01-1-.4 4 4 0 01-.7-.6l.8-1.2s0-.2.2-.2l.3-.1.5.1a32.8 32.8 0 001.1.7l1 .1c.4 0 .7 0 1-.3.3-.2.4-.5.4-1 0-.2 0-.4-.2-.6l-.6-.4a23.2 23.2 0 01-2.6-.9l-.7-.5-.6-1a3.5 3.5 0 010-2.4l.8-1c.3-.3.7-.6 1.2-.8.4-.2 1-.2 1.6-.2a6 6 0 011.9.3 5 5 0 011.4.8l-.6 1.2zm6.6 6.7c.3 0 .6 0 .9-.2.3 0 .5-.2.7-.5l.4-.7.1-1V199h2.6v6.4a5 5 0 01-.4 1.9 4.1 4.1 0 01-2.4 2.4c-.5.2-1.2.3-2 .3-.7 0-1.3 0-1.9-.3-.6-.2-1-.6-1.5-1-.4-.4-.7-.8-.9-1.4-.2-.6-.3-1.2-.3-1.9v-6.4h2.5v6.4c0 .4 0 .8.2 1 0 .4.2.6.4.8.2.3.4.4.7.5l.9.2zm13.4-9v2h-4.5v2.8h3.7v2h-3.7v4.2h-2.6v-11h7zm8.3 0v2h-4.5v2.8h3.8v2h-3.8v4.2h-2.5v-11h7zm4.1 11H50v-11h2.5v11zm4.7-5.6l-3.4-5.3h2.9l.2.3L59 203l.1-.2.1-.1 2-3.3c0-.2.2-.3.4-.3h2.5l-3.5 5.2 3.5 5.7h-2.5l-.4-.1a1 1 0 01-.2-.3l-2.2-3.8-.2.3-2 3.5-.3.3-.3.1h-2.4l3.5-5.6z"/>
|
||||
<path id="P" d="M8 264v2H3.4v2.6h3.4v1.8H3.4v2.7H8v1.9H1v-11h7zm4 5.4L8.8 264h2.9l.2.3L14 268v-.2l.2-.1 1.9-3.3c0-.2.3-.3.5-.3H19l-3.4 5.2L19 275h-2.6l-.4-.1a1 1 0 01-.2-.3l-2.2-3.8-.1.3-2 3.5-.3.3-.4.1H8.6l3.5-5.6zm15.2 2.8h.2l.1.1 1 1c-.4.7-1 1-1.6 1.4-.7.3-1.5.4-2.4.4-.8 0-1.5-.1-2.2-.4a4.8 4.8 0 01-2.7-3 6.5 6.5 0 010-4.4 5.2 5.2 0 013-3 6.3 6.3 0 014.5 0 4.8 4.8 0 011.6 1.1l-.9 1.2-.2.1-.3.1H27l-.2-.2a45 45 0 00-1.2-.5 3.5 3.5 0 00-2 .2l-1 .7-.6 1-.2 1.5c0 .6 0 1 .2 1.5l.6 1.1a2.6 2.6 0 002 1l.7-.1a2.6 2.6 0 001.5-.7h.2l.2-.1zm9.5-8.1v2h-4.5v2.5h3.5v1.8h-3.5v2.7h4.5v1.9h-7v-11h7zm4 7.2v3.7h-2.5v-11H42c.8 0 1.4.2 2 .3.6.2 1 .5 1.4.8l.8 1.1.2 1.4c0 .6 0 1-.3 1.5a3 3 0 01-.8 1.2l-1.3.8c-.6.2-1.2.2-2 .2h-1.3zm0-1.9H42c.7 0 1.1-.1 1.4-.4.3-.4.5-.8.5-1.4l-.1-.6a1.4 1.4 0 00-1-1h-2.1v3.4zm15-5.3v2h-3.1v8.9H50v-9H47v-2h8.7zm3.8 10.9h-2.6v-11h2.6v11zm12.8-5.5c0 .8-.1 1.6-.4 2.2a5.3 5.3 0 01-5.3 3.4c-.8 0-1.6-.1-2.3-.4a5.3 5.3 0 01-3.4-5.2c0-.8.2-1.5.5-2.2a5.2 5.2 0 013-3c.6-.2 1.4-.3 2.2-.3a6 6 0 012.4.4 5.4 5.4 0 013.3 5.1zm-2.6 0c0-.5 0-1-.2-1.4a3 3 0 00-.6-1.1c-.3-.3-.6-.6-1-.7a3.4 3.4 0 00-2.6 0c-.4.1-.7.4-1 .7a3 3 0 00-.5 1l-.3 1.5c0 .6.1 1 .3 1.5 0 .4.3.8.6 1.1.2.3.5.5 1 .7l1.2.2c.5 0 1 0 1.3-.2l1-.7c.3-.3.5-.7.6-1.1l.2-1.5zm5.1-5.4h.5l.2.2.2.2 5.2 6.5a13.8 13.8 0 010-1.1V264H83V275h-1.9a1 1 0 01-.4-.4l-5.1-6.5a23.3 23.3 0 010 1v5.9h-2.2v-11h1.3z"/>
|
||||
<path id="Q" d="M31.8 334.5c0 .8-.1 1.6-.4 2.2a5.1 5.1 0 01-3 2.9c-.6.3-1.4.4-2.3.4H22v-11h4.2c.9 0 1.7.2 2.4.5s1.3.6 1.8 1.1c.5.5.8 1 1.1 1.8.3.6.4 1.3.4 2.1zm-2.6 0c0-.5 0-1-.2-1.4-.1-.5-.3-.8-.6-1.1-.3-.3-.6-.6-1-.7-.3-.2-.8-.3-1.3-.3h-1.7v7h1.7c.5 0 1 0 1.3-.2l1-.7c.3-.3.5-.7.6-1.1.2-.4.2-1 .2-1.5zm14.6 0c0 .8-.1 1.6-.4 2.2a5.3 5.3 0 01-5.3 3.4c-.8 0-1.6-.1-2.3-.4a5.3 5.3 0 01-3.3-5.2c0-.8.1-1.5.4-2.2a5.2 5.2 0 013-3c.6-.2 1.4-.3 2.2-.3a6 6 0 012.4.4 5.4 5.4 0 013.3 5.1zm-2.6 0c0-.5 0-1-.2-1.4a3 3 0 00-.6-1.1c-.3-.3-.6-.6-1-.7a3.4 3.4 0 00-2.6 0l-1 .7a3 3 0 00-.5 1c-.2.5-.2 1-.2 1.5 0 .6 0 1 .2 1.5.1.4.3.8.6 1.1.2.3.6.5 1 .7l1.2.2c.5 0 1 0 1.3-.2l1-.7c.3-.3.5-.7.6-1.1.2-.4.2-1 .2-1.5zm5.2-5.4h.4l.2.2.2.2 5.2 6.5a13.8 13.8 0 010-1.1V329h2.2V340H52.8a1 1 0 01-.4-.4l-5.2-6.5a23.3 23.3 0 010 1v5.9H45v-11h1.4zm17 0v2H59v2.5h3.5v1.8h-3.5v2.7h4.5v1.9h-7v-11h7z"/>
|
||||
<path id="R" d="M8 69v2H3.4v2.6h3.4v1.8H3.4V78H8v2H1V69h7zm4 5.4L8.8 69h2.9l.2.3L14 73v-.2l.2-.1 1.9-3.3c0-.2.3-.3.5-.3H19l-3.4 5.2L19 80h-2.6l-.4-.1a1 1 0 01-.2-.3l-2.2-3.8-.1.3-2 3.5-.3.3-.4.1H8.6l3.5-5.6zm15.2 2.8h.2l.1.1 1 1c-.4.7-1 1-1.6 1.4-.7.3-1.5.4-2.4.4-.8 0-1.5-.1-2.2-.4a4.8 4.8 0 01-2.7-3 6.5 6.5 0 010-4.4 5.2 5.2 0 013-3 6.3 6.3 0 014.5 0 4.8 4.8 0 011.6 1.1l-.9 1.2-.2.1-.3.1H27l-.2-.2a45 45 0 00-1.2-.5 3.5 3.5 0 00-2 .2l-1 .7-.6 1-.2 1.5c0 .6 0 1 .2 1.5l.6 1.1a2.6 2.6 0 002 1l.7-.1a2.6 2.6 0 001.5-.7h.2l.2-.1zm9.5-8.1v2h-4.5v2.5h3.5v1.8h-3.5V78h4.5v2h-7V69h7zm4 7.2V80h-2.5V69H42c.8 0 1.4.2 2 .3.6.2 1 .5 1.4.8l.8 1.1.2 1.4c0 .6 0 1-.3 1.5a3 3 0 01-.8 1.2l-1.3.8c-.6.2-1.2.2-2 .2h-1.3zm0-1.9H42c.7 0 1.1-.1 1.4-.4.3-.4.5-.8.5-1.4l-.1-.6a1.4 1.4 0 00-1-1h-2.1v3.4zm15-5.3v2h-3.1V80H50v-9H47v-2h8.7zM59.5 80h-2.6V69h2.6v11zm12.8-5.5c0 .8-.1 1.6-.4 2.2a5.3 5.3 0 01-5.3 3.4c-.8 0-1.6-.1-2.3-.4a5.3 5.3 0 01-3.4-5.2c0-.8.2-1.5.5-2.2a5.2 5.2 0 013-3c.6-.2 1.4-.3 2.2-.3a6 6 0 012.4.4 5.4 5.4 0 013.3 5.1zm-2.6 0c0-.5 0-1-.2-1.4a3 3 0 00-.6-1.1c-.3-.3-.6-.6-1-.7a3.4 3.4 0 00-2.6 0c-.4.1-.7.4-1 .7a3 3 0 00-.5 1l-.3 1.5c0 .6.1 1 .3 1.5 0 .4.3.8.6 1.1.2.3.5.5 1 .7l1.2.2c.5 0 1 0 1.3-.2l1-.7c.3-.3.5-.7.6-1.1l.2-1.5zm5.1-5.4h.5l.2.2.2.2 5.2 6.5a13.8 13.8 0 010-1.1V69H83V80h-1.9a1 1 0 01-.4-.4L75.8 73a23.3 23.3 0 010 1V80h-2.2V69h1.3z"/>
|
||||
</defs>
|
||||
<g fill="none" fill-rule="evenodd">
|
||||
<g stroke-linejoin="round" stroke-width="3.8">
|
||||
<path stroke="#3AC" d="M82.4 46.5v13h-60v12m60-25v13h21.8v12"/>
|
||||
<path fill="#C3E7F1" stroke="#3AC" d="M6 5h152.7v41.7H6z"/>
|
||||
<path fill="#F5F5F5" stroke="#B7B7B7" d="M195.8 46.5v25"/>
|
||||
<path fill="#F5F5F5" stroke="#B7B7B7" d="M168.5 5h54.6v41.7h-54.6z"/>
|
||||
<path fill="#F5F5F5" stroke="#B7B7B7" d="M261.3 46.5v25"/>
|
||||
<path fill="#F5F5F5" stroke="#B7B7B7" d="M234 5h54.5v41.7H234z"/>
|
||||
<path fill="#F5F5F5" stroke="#B7B7B7" d="M377 46.5v25"/>
|
||||
<path fill="#F5F5F5" stroke="#B7B7B7" d="M299.5 5h153.8v41.7H299.5z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M22.4 113v21.8"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M6 71.5h32.7v41.7H6z"/>
|
||||
<path stroke="#3AC" d="M104.2 113v12H76.9v9.8m27.3-21.8v12h31.6v9.8"/>
|
||||
<path fill="#C3E7F1" stroke="#3AC" d="M49.6 71.5h109.1v41.7H49.6z"/>
|
||||
<path fill="#F5F5F5" stroke="#B7B7B7" d="M195.8 113v21.8"/>
|
||||
<path fill="#F5F5F5" stroke="#B7B7B7" d="M168.5 71.5h54.6v41.7h-54.6z"/>
|
||||
<path fill="#F5F5F5" stroke="#B7B7B7" d="M261.3 113v21.8"/>
|
||||
<path fill="#F5F5F5" stroke="#B7B7B7" d="M234 71.5h54.5v41.7H234z"/>
|
||||
<path fill="#F5F5F5" stroke="#B7B7B7" d="M377 113v21.8"/>
|
||||
<path fill="#F5F5F5" stroke="#B7B7B7" d="M299.5 71.5h153.8v41.7H299.5z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M6 134.8h32.7v41.5H6z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M77 176.3v26.2"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M49.6 134.8h54.6v41.5H49.6z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M195.8 176.3v26.2"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M168.5 134.8h54.6v41.5h-54.6z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M261.3 176.3v26.2"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M234 134.8h54.5v41.5H234z"/>
|
||||
<path stroke="#3AC" d="M377 176.3v14.2h-22v12m22-26.2v14.2h60v12"/>
|
||||
<path fill="#C3E7F1" stroke="#3AC" d="M299.5 134.8h153.8v41.5H299.5z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M135.8 176.3v26.2"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M114 134.8h43.6v41.5H114z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M22.4 244v25"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M6 202.2h32.7v41.7H6z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M77 244v25"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M49.6 202.2h54.6v41.7H49.6z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M195.8 244v25"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M168.5 202.2h54.6v41.7h-54.6z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M261.3 244v25"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M234 202.2h54.5v41.7H234z"/>
|
||||
<path stroke="#3AC" d="M355 244v12h-21.7v13m21.8-25v12h37v13"/>
|
||||
<path fill="#C3E7F1" stroke="#3AC" d="M299.5 202.2h110.1v41.7H299.5z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M135.8 244v25"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M114 202.2h43.6v41.7H114z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M437 244v25"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M420.5 202.2h32.8v41.7h-32.8z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M22.4 310.5v25"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M6 268.7h32.7v41.8H6z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M77 310.5v25"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M49.6 268.7h54.6v41.8H49.6z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M195.8 310.5v21.8-18.6 21.8"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M168.5 268.7h54.6v41.8h-54.6z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M234 268.7h54.5v41.8H234z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M333.3 310.5v25"/>
|
||||
<path fill="#C3E7F1" stroke="#3AC" d="M299.5 268.7H366v41.8h-66.5z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M135.8 310.5v25"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M114 268.7h43.6v41.8H114z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M437 310.5v25"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M420.5 268.7h32.8v41.8h-32.8z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M392.2 310.5v25"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M375.8 268.7h32.7v41.8h-32.7z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M8 335.5h28.7a2 2 0 012 2V375a2 2 0 01-2 2H8a2 2 0 01-2-2v-37.5c0-1 .9-2 2-2z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M49.6 335.5h54.6V377H49.6z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M168.5 335.5h54.6V377h-54.6z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M234 335.5h54.5V377H234z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M299.5 335.5H366V377h-66.5z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M114 335.5h43.6V377H114z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M420.5 335.5h32.8V377h-32.8z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M375.8 335.5h32.7V377h-32.7z"/>
|
||||
<path fill="#B5F3D4" stroke="#3AD787" d="M261.3 310.5v25"/>
|
||||
</g>
|
||||
<g fill-rule="nonzero">
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#a"/>
|
||||
<use fill="#1A1E23" xlink:href="#a"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#b"/>
|
||||
<use fill="#1A1E23" xlink:href="#b"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#c"/>
|
||||
<use fill="#1A1E23" xlink:href="#c"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#d"/>
|
||||
<use fill="#1A1E23" xlink:href="#d"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#e"/>
|
||||
<use fill="#1A1E23" xlink:href="#e"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#f"/>
|
||||
<use fill="#1A1E23" xlink:href="#f"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#g"/>
|
||||
<use fill="#1A1E23" xlink:href="#g"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#h"/>
|
||||
<use fill="#1A1E23" xlink:href="#h"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#i"/>
|
||||
<use fill="#1A1E23" xlink:href="#i"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#j"/>
|
||||
<use fill="#1A1E23" xlink:href="#j"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#k"/>
|
||||
<use fill="#1A1E23" xlink:href="#k"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#l"/>
|
||||
<use fill="#1A1E23" xlink:href="#l"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#m"/>
|
||||
<use fill="#1A1E23" xlink:href="#m"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#n"/>
|
||||
<use fill="#1A1E23" xlink:href="#n"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#o"/>
|
||||
<use fill="#1A1E23" xlink:href="#o"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#p"/>
|
||||
<use fill="#1A1E23" xlink:href="#p"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#q"/>
|
||||
<use fill="#1A1E23" xlink:href="#q"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#r"/>
|
||||
<use fill="#1A1E23" xlink:href="#r"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#s"/>
|
||||
<use fill="#1A1E23" xlink:href="#s"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#t"/>
|
||||
<use fill="#1A1E23" xlink:href="#t"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#u"/>
|
||||
<use fill="#1A1E23" xlink:href="#u"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#v"/>
|
||||
<use fill="#1A1E23" xlink:href="#v"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#w"/>
|
||||
<use fill="#1A1E23" xlink:href="#w"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#x"/>
|
||||
<use fill="#1A1E23" xlink:href="#x"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#y"/>
|
||||
<use fill="#1A1E23" xlink:href="#y"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#z"/>
|
||||
<use fill="#1A1E23" xlink:href="#z"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#A"/>
|
||||
<use fill="#1A1E23" xlink:href="#A"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#B"/>
|
||||
<use fill="#1A1E23" xlink:href="#B"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#C"/>
|
||||
<use fill="#1A1E23" xlink:href="#C"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#D"/>
|
||||
<use fill="#1A1E23" xlink:href="#D"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#E"/>
|
||||
<use fill="#1A1E23" xlink:href="#E"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#F"/>
|
||||
<use fill="#1A1E23" xlink:href="#F"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#G"/>
|
||||
<use fill="#1A1E23" xlink:href="#G"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#H"/>
|
||||
<use fill="#1A1E23" xlink:href="#H"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#I"/>
|
||||
<use fill="#1A1E23" xlink:href="#I"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#J"/>
|
||||
<use fill="#1A1E23" xlink:href="#J"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#K"/>
|
||||
<use fill="#1A1E23" xlink:href="#K"/>
|
||||
</g>
|
||||
<g transform="translate(6 11)">
|
||||
<use fill="#3D4251" xlink:href="#L"/>
|
||||
<use fill="#1A1E23" xlink:href="#L"/>
|
||||
</g>
|
||||
</g>
|
||||
<rect width="101" height="20" x="483" y="16" fill="#3AC" fill-rule="nonzero" stroke="#3AC" stroke-width="2.2" rx="10"/>
|
||||
<rect width="101" height="20" x="483" y="146" fill="#3AC" fill-rule="nonzero" stroke="#3AC" stroke-width="2.2" rx="10"/>
|
||||
<rect width="101" height="20" x="483" y="211" fill="#3AC" fill-rule="nonzero" stroke="#3AC" stroke-width="2.2" rx="10"/>
|
||||
<rect width="101" height="20" x="483" y="276" fill="#3AC" fill-rule="nonzero" stroke="#3AC" stroke-width="2.2" rx="10"/>
|
||||
<rect width="101" height="20" x="483" y="341" fill="#3AD787" fill-rule="nonzero" stroke="#3AD787" stroke-width="2.2" rx="10"/>
|
||||
<rect width="101" height="20" x="483" y="81" fill="#3AC" fill-rule="nonzero" stroke="#3AC" stroke-width="2.2" rx="10"/>
|
||||
<g fill-rule="nonzero">
|
||||
<g transform="translate(493 16)">
|
||||
<use fill="#000" xlink:href="#M"/>
|
||||
<use fill="#FFF" xlink:href="#M"/>
|
||||
</g>
|
||||
<g transform="translate(493 16)">
|
||||
<use fill="#000" xlink:href="#N"/>
|
||||
<use fill="#FFF" xlink:href="#N"/>
|
||||
</g>
|
||||
<g transform="translate(493 16)">
|
||||
<use fill="#000" xlink:href="#O"/>
|
||||
<use fill="#FFF" xlink:href="#O"/>
|
||||
</g>
|
||||
<g transform="translate(493 16)">
|
||||
<use fill="#000" xlink:href="#P"/>
|
||||
<use fill="#FFF" xlink:href="#P"/>
|
||||
</g>
|
||||
<g transform="translate(493 16)">
|
||||
<use fill="#000" xlink:href="#Q"/>
|
||||
<use fill="#FFF" xlink:href="#Q"/>
|
||||
</g>
|
||||
<g transform="translate(493 16)">
|
||||
<use fill="#000" xlink:href="#R"/>
|
||||
<use fill="#FFF" xlink:href="#R"/>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
Before: 12 KiB → After: 45 KiB
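The diagrams above illustrate how spaCy's rule-based tokenizer arrives at the final tokens. As a minimal sketch (not part of this diff, assuming only the public spacy.blank and Tokenizer.explain APIs), the same split can be reproduced like this:

import spacy

# Blank English pipeline: only the rule-based tokenizer, no trained components.
nlp = spacy.blank("en")

doc = nlp("“Let's go to N.Y.!”")
print([t.text for t in doc])
# Roughly: ['“', 'Let', "'s", 'go', 'to', 'N.Y.', '!', '”']

# tokenizer.explain reports which rule produced each piece (PREFIX, SUFFIX,
# SPECIAL/exception or TOKEN), mirroring the badges in the diagram.
for rule, text in nlp.tokenizer.explain("“Let's go to N.Y.!”"):
    print(rule, repr(text))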