diff --git a/pyproject.toml b/pyproject.toml
index e88ba7db9..611a95d27 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
- "thinc>=8.0.0a42,<8.0.0a50",
+ "thinc>=8.0.0a43,<8.0.0a50",
"blis>=0.4.0,<0.5.0",
"pytokenizations",
"pathy"
diff --git a/requirements.txt b/requirements.txt
index 064efed42..44dad38e3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,12 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a42,<8.0.0a50
+thinc>=8.0.0a43,<8.0.0a50
blis>=0.4.0,<0.5.0
ml_datasets==0.2.0a0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.8.0,<1.1.0
-srsly>=2.1.0,<3.0.0
+srsly>=2.3.0,<3.0.0
catalogue>=2.0.1,<2.1.0
typer>=0.3.0,<0.4.0
pathy
diff --git a/setup.cfg b/setup.cfg
index 36ab64bd9..7192ba9d4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,16 +34,16 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
- thinc>=8.0.0a42,<8.0.0a50
+ thinc>=8.0.0a43,<8.0.0a50
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
- thinc>=8.0.0a42,<8.0.0a50
+ thinc>=8.0.0a43,<8.0.0a50
blis>=0.4.0,<0.5.0
wasabi>=0.8.0,<1.1.0
- srsly>=2.1.0,<3.0.0
+ srsly>=2.3.0,<3.0.0
catalogue>=2.0.1,<2.1.0
typer>=0.3.0,<0.4.0
pathy
@@ -66,6 +66,8 @@ console_scripts =
[options.extras_require]
lookups =
spacy_lookups_data==1.0.0rc0
+transformers =
+ spacy_transformers>=1.0.0a17,<1.0.0
cuda =
cupy>=5.0.0b4,<9.0.0
cuda80 =
@@ -84,7 +86,7 @@ cuda102 =
cupy-cuda102>=5.0.0b4,<9.0.0
# Language tokenizers with external dependencies
ja =
- sudachipy>=0.4.5
+ sudachipy>=0.4.9
sudachidict_core>=20200330
ko =
natto-py==0.9.0
diff --git a/spacy/about.py b/spacy/about.py
index 18fc77184..acf386ace 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
-__version__ = "3.0.0a28"
+__version__ = "3.0.0a29"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 69dac0aa1..3bd237b0a 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -37,6 +37,22 @@ tokenizer_config = {"use_fast": true}
window = 128
stride = 96
+{% if "morphologizer" in components %}
+[components.morphologizer]
+factory = "morphologizer"
+
+[components.morphologizer.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.morphologizer.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.morphologizer.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{%- endif %}
+
{% if "tagger" in components %}
[components.tagger]
factory = "tagger"
@@ -166,6 +182,19 @@ depth = {{ 4 if optimize == "efficiency" else 8 }}
window_size = 1
maxout_pieces = 3
+{% if "morphologizer" in components %}
+[components.morphologizer]
+factory = "morphologizer"
+
+[components.morphologizer.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.morphologizer.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{%- endif %}
+
{% if "tagger" in components %}
[components.tagger]
factory = "tagger"
@@ -257,7 +286,7 @@ no_output_layer = false
{% endif %}
{% for pipe in components %}
-{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %}
+{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "entity_linker"] %}
{# Other components defined by the user: we just assume they're factories #}
[components.{{ pipe }}]
factory = "{{ pipe }}"
diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg
index 4011159a4..66987171a 100644
--- a/spacy/default_config_pretraining.cfg
+++ b/spacy/default_config_pretraining.cfg
@@ -34,7 +34,7 @@ learn_rate = 0.001
[corpora]
[corpora.pretrain]
-@readers = "spacy.JsonlReader.v1"
+@readers = "spacy.JsonlCorpus.v1"
path = ${paths.raw_text}
min_length = 5
max_length = 500
diff --git a/spacy/errors.py b/spacy/errors.py
index 5236992e9..dbb25479d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -419,7 +419,7 @@ class Errors:
E164 = ("x is neither increasing nor decreasing: {}.")
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
"that case.")
- E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
+ E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
"Current DocBin: {current}\nOther DocBin: {other}")
E169 = ("Can't find module: {module}")
E170 = ("Cannot apply transition {name}: invalid for the current state.")
@@ -477,12 +477,8 @@ class Errors:
E201 = ("Span index out of range.")
# TODO: fix numbering after merging develop into master
- E912 = ("No orth_variants lookups table for data augmentation available for "
- "language '{lang}'. If orth_variants are available in "
- "spacy-lookups-data, make sure the package is installed and the "
- "table is loaded in the [initialize.lookups] block of your config. "
- "Alternatively, you can provide your own Lookups object with a "
- "table orth_variants as the argument 'lookuos' of the augmenter.")
+ E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
+ "for mode '{mode}'. Required tables: {tables}. Found: {found}.")
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
"config.cfg or override it on the CLI?")
E914 = ("Executing {name} callback failed. Expected the function to "
@@ -562,10 +558,10 @@ class Errors:
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
"component.")
- E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
- "spacy-lookups-data. If you want to initialize a blank nlp object, "
- "make sure you have the spacy-lookups-data package installed or "
- "remove the [initialize.lookups] block from your config.")
+ E955 = ("Can't find table(s) {table} for language '{lang}' in "
+ "spacy-lookups-data. Make sure you have the package installed or "
+ "provide your own lookup tables if no default lookups are available "
+ "for your language.")
E956 = ("Can't find component '{name}' in [components] block in the config. "
"Available components: {opts}")
E957 = ("Writing directly to Language.factories isn't needed anymore in "
@@ -691,9 +687,8 @@ class Errors:
E1002 = ("Span index out of range.")
E1003 = ("Unsupported lemmatizer mode '{mode}'.")
E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
- "Required tables '{tables}', found '{found}'. If you are not "
- "providing custom lookups, make sure you have the package "
- "spacy-lookups-data installed.")
+ "Required tables: {tables}. Found: {found}. Maybe you forgot to "
+ "call nlp.initialize() to load in the data?")
E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
"'{chunk}'. Tokenizer exceptions are only allowed to specify "
"`ORTH` and `NORM`.")
diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index 923e29a17..879229888 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -4,7 +4,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from ...language import Language
-from ...lookups import Lookups
from ...pipeline import Lemmatizer
@@ -24,18 +23,11 @@ class Bengali(Language):
@Bengali.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "lookups": None},
+ default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
-def make_lemmatizer(
- nlp: Language,
- model: Optional[Model],
- name: str,
- mode: str,
- lookups: Optional[Lookups],
-):
- lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
- return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+ return Lemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Bengali"]
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 1a7b19914..53069334e 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lemmatizer import GreekLemmatizer
-from ...lookups import Lookups
from ...language import Language
@@ -29,18 +28,11 @@ class Greek(Language):
@Greek.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "lookups": None},
+ default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
-def make_lemmatizer(
- nlp: Language,
- model: Optional[Model],
- name: str,
- mode: str,
- lookups: Optional[Lookups],
-):
- lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
- return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+ return GreekLemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Greek"]
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index cc01f1aea..3a3ebeefd 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES
from .lemmatizer import EnglishLemmatizer
from ...language import Language
-from ...lookups import Lookups
class EnglishDefaults(Language.Defaults):
@@ -27,18 +26,11 @@ class English(Language):
@English.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "lookups": None},
+ default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
-def make_lemmatizer(
- nlp: Language,
- model: Optional[Model],
- name: str,
- mode: str,
- lookups: Optional[Lookups],
-):
- lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
- return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+ return EnglishLemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["English"]
diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py
index f3a6635dc..77ee3bca3 100644
--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
-from ...lookups import Lookups
from ...pipeline import Lemmatizer
@@ -27,18 +26,11 @@ class Persian(Language):
@Persian.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "lookups": None},
+ default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
-def make_lemmatizer(
- nlp: Language,
- model: Optional[Model],
- name: str,
- mode: str,
- lookups: Optional[Lookups],
-):
- lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
- return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+ return Lemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Persian"]
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index 72e641d1f..1e0011fba 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .lemmatizer import FrenchLemmatizer
-from ...lookups import Lookups
from ...language import Language
@@ -32,18 +31,11 @@ class French(Language):
@French.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "lookups": None},
+ default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
-def make_lemmatizer(
- nlp: Language,
- model: Optional[Model],
- name: str,
- mode: str,
- lookups: Optional[Lookups],
-):
- lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
- return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+ return FrenchLemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["French"]
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index 9672dfd6e..62d7707f3 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -6,7 +6,6 @@ from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
-from ...lookups import Lookups
from ...pipeline import Lemmatizer
@@ -27,18 +26,11 @@ class Norwegian(Language):
@Norwegian.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "lookups": None},
+ default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
-def make_lemmatizer(
- nlp: Language,
- model: Optional[Model],
- name: str,
- mode: str,
- lookups: Optional[Lookups],
-):
- lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
- return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+ return Lemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Norwegian"]
diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index 15b6b9de2..a3591f1bf 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -1,5 +1,4 @@
from typing import Optional
-
from thinc.api import Model
from .stop_words import STOP_WORDS
@@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer
-from ...lookups import Lookups
from ...language import Language
@@ -29,18 +27,11 @@ class Dutch(Language):
@Dutch.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "lookups": None},
+ default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
-def make_lemmatizer(
- nlp: Language,
- model: Optional[Model],
- name: str,
- mode: str,
- lookups: Optional[Lookups],
-):
- lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
- return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+ return DutchLemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Dutch"]
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 573dbc6f9..f7be8a6c2 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -34,18 +34,11 @@ class Polish(Language):
@Polish.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "pos_lookup", "lookups": None},
+ default_config={"model": None, "mode": "pos_lookup"},
default_score_weights={"lemma_acc": 1.0},
)
-def make_lemmatizer(
- nlp: Language,
- model: Optional[Model],
- name: str,
- mode: str,
- lookups: Optional[Lookups],
-):
- lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
- return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+ return PolishLemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Polish"]
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 6436ae0c7..1d59ca043 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer
from ...language import Language
-from ...lookups import Lookups
class RussianDefaults(Language.Defaults):
@@ -23,17 +22,11 @@ class Russian(Language):
@Russian.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+ default_config={"model": None, "mode": "pymorphy2"},
default_score_weights={"lemma_acc": 1.0},
)
-def make_lemmatizer(
- nlp: Language,
- model: Optional[Model],
- name: str,
- mode: str,
- lookups: Optional[Lookups],
-):
- return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+ return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Russian"]
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index ea314f487..2490eb9ec 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -5,7 +5,6 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
-from ...lookups import Lookups
from ...pipeline import Lemmatizer
@@ -30,18 +29,11 @@ class Swedish(Language):
@Swedish.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "lookups": None},
+ default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
-def make_lemmatizer(
- nlp: Language,
- model: Optional[Model],
- name: str,
- mode: str,
- lookups: Optional[Lookups],
-):
- lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
- return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+ return Lemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Swedish"]
diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index 006a1cf7f..73c065379 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import UkrainianLemmatizer
from ...language import Language
-from ...lookups import Lookups
class UkrainianDefaults(Language.Defaults):
@@ -24,17 +23,11 @@ class Ukrainian(Language):
@Ukrainian.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+ default_config={"model": None, "mode": "pymorphy2"},
default_score_weights={"lemma_acc": 1.0},
)
-def make_lemmatizer(
- nlp: Language,
- model: Optional[Model],
- name: str,
- mode: str,
- lookups: Optional[Lookups],
-):
- return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+ return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Ukrainian"]
diff --git a/spacy/ml/featureextractor.py b/spacy/ml/featureextractor.py
index dcf212628..ed2918f02 100644
--- a/spacy/ml/featureextractor.py
+++ b/spacy/ml/featureextractor.py
@@ -1,7 +1,8 @@
from typing import List, Union, Callable, Tuple
-from thinc.types import Ints2d, Doc
+from thinc.types import Ints2d
from thinc.api import Model, registry
+from ..tokens import Doc
@registry.layers("spacy.FeatureExtractor.v1")
@@ -9,7 +10,9 @@ def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[In
return Model("extract_features", forward, attrs={"columns": columns})
-def forward(model: Model[List[Doc], List[Ints2d]], docs, is_train: bool) -> Tuple[List[Ints2d], Callable]:
+def forward(
+ model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
+) -> Tuple[List[Ints2d], Callable]:
columns = model.attrs["columns"]
features: List[Ints2d] = []
for doc in docs:
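The `FeatureExtractor` layer now types its input as spaCy `Doc` objects (imported from `spacy.tokens` rather than `thinc.types`). A minimal sketch of using the layer on its own, with illustrative column names and a blank English pipeline:

```python
from spacy.lang.en import English
from spacy.ml.featureextractor import FeatureExtractor

nlp = English()
docs = [nlp.make_doc("A small example document")]
# One Ints2d array per Doc, one column per requested attribute
extractor = FeatureExtractor(["ORTH", "SHAPE"])
features, backprop = extractor(docs, is_train=False)
assert features[0].shape == (4, 2)  # 4 tokens, 2 columns
```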
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 95f9c66df..120e9b02c 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -1,4 +1,4 @@
-from typing import Optional, List
+from typing import Optional, List, Union
from thinc.types import Floats2d
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
@@ -10,7 +10,7 @@ from ...ml import _character_embed
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
+from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
@registry.architectures.register("spacy.Tok2VecListener.v1")
@@ -98,7 +98,7 @@ def MultiHashEmbed(
attributes using hash embedding, concatenates the results, and passes it
through a feed-forward subnetwork to build a mixed representations.
- The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have
+ The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
varying definitions depending on the Vocab of the Doc object passed in.
Vectors from pretrained static vectors can also be incorporated into the
concatenated representation.
@@ -115,7 +115,7 @@ def MultiHashEmbed(
also_use_static_vectors (bool): Whether to also use static word vectors.
Requires a vectors table to be loaded in the Doc objects' vocab.
"""
- cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
+ cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
seed = 7
def make_hash_embed(feature):
@@ -123,7 +123,7 @@ def MultiHashEmbed(
seed += 1
return HashEmbed(
width,
- rows if feature == NORM else rows // 2,
+ rows if feature == LOWER else rows // 2,
column=cols.index(feature),
seed=seed,
dropout=0.0,
@@ -131,13 +131,13 @@ def MultiHashEmbed(
if also_embed_subwords:
embeddings = [
- make_hash_embed(NORM),
+ make_hash_embed(LOWER),
make_hash_embed(PREFIX),
make_hash_embed(SUFFIX),
make_hash_embed(SHAPE),
]
else:
- embeddings = [make_hash_embed(NORM)]
+ embeddings = [make_hash_embed(LOWER)]
concat_size = width * (len(embeddings) + also_use_static_vectors)
if also_use_static_vectors:
model = chain(
@@ -165,7 +165,8 @@ def MultiHashEmbed(
@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(
- width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
+ width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool,
+    feature: Union[int, str] = "LOWER"
):
"""Construct an embedded representation based on character embeddings, using
a feed-forward network. A fixed number of UTF-8 byte characters are used for
@@ -179,12 +180,13 @@ def CharacterEmbed(
of being in an arbitrary position depending on the word length.
The characters are embedded in a embedding table with a given number of rows,
- and the vectors concatenated. A hash-embedded vector of the NORM of the word is
+ and the vectors concatenated. A hash-embedded vector of the LOWER of the word is
also concatenated on, and the result is then passed through a feed-forward
network to construct a single vector to represent the information.
- width (int): The width of the output vector and the NORM hash embedding.
- rows (int): The number of rows in the NORM hash embedding table.
+ feature (int or str): An attribute to embed, to concatenate with the characters.
+ width (int): The width of the output vector and the feature embedding.
+    rows (int): The number of rows in the feature hash embedding table.
nM (int): The dimensionality of the character embeddings. Recommended values
are between 16 and 64.
nC (int): The number of UTF-8 bytes to embed per word. Recommended values
@@ -193,12 +195,15 @@ def CharacterEmbed(
also_use_static_vectors (bool): Whether to also use static word vectors.
Requires a vectors table to be loaded in the Doc objects' vocab.
"""
+ feature = intify_attr(feature)
+ if feature is None:
+ raise ValueError("Invalid feature: Must be a token attribute.")
if also_use_static_vectors:
model = chain(
concatenate(
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
chain(
- FeatureExtractor([NORM]),
+ FeatureExtractor([feature]),
list2ragged(),
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
),
@@ -214,7 +219,7 @@ def CharacterEmbed(
concatenate(
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
chain(
- FeatureExtractor([NORM]),
+ FeatureExtractor([feature]),
list2ragged(),
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
),
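`MultiHashEmbed` switches its main attribute from `NORM` to `LOWER`, and `CharacterEmbed` gains a `feature` argument for the attribute concatenated with the character embeddings. A minimal sketch of constructing the layer directly; the sizes are illustrative, not tuned recommendations:

```python
from spacy.ml.models.tok2vec import CharacterEmbed

# Embed 8 UTF-8 bytes per word at 64 dims each, concatenated with a hashed LOWER feature
model = CharacterEmbed(
    width=128,
    rows=7000,
    nM=64,
    nC=8,
    also_use_static_vectors=False,
    feature="LOWER",
)
```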
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 391769604..9be596868 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -1,26 +1,25 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
+from typing import Tuple
from thinc.api import Model
+from pathlib import Path
from .pipe import Pipe
from ..errors import Errors
from ..language import Language
+from ..training import Example
from ..lookups import Lookups, load_lookups
from ..scorer import Scorer
from ..tokens import Doc, Token
from ..vocab import Vocab
from ..training import validate_examples
+from ..util import logger, SimpleFrozenList
from .. import util
@Language.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={
- "model": None,
- "mode": "lookup",
- "lookups": None,
- "overwrite": False,
- },
+ default_config={"model": None, "mode": "lookup", "overwrite": False},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
@@ -28,13 +27,9 @@ def make_lemmatizer(
model: Optional[Model],
name: str,
mode: str,
- lookups: Optional[Lookups],
overwrite: bool = False,
):
- lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
- return Lemmatizer(
- nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
- )
+ return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
class Lemmatizer(Pipe):
@@ -46,59 +41,19 @@ class Lemmatizer(Pipe):
"""
@classmethod
- def get_lookups_config(cls, mode: str) -> Dict:
+ def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
"""Returns the lookups configuration settings for a given mode for use
in Lemmatizer.load_lookups.
mode (str): The lemmatizer mode.
- RETURNS (dict): The lookups configuration settings for this mode.
-
- DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
+ RETURNS (Tuple[List[str], List[str]]): The required and optional
+ lookup tables for this mode.
"""
if mode == "lookup":
- return {
- "required_tables": ["lemma_lookup"],
- }
+ return (["lemma_lookup"], [])
elif mode == "rule":
- return {
- "required_tables": ["lemma_rules"],
- "optional_tables": ["lemma_exc", "lemma_index"],
- }
- return {}
-
- @classmethod
- def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
- """Load and validate lookups tables. If the provided lookups is None,
- load the default lookups tables according to the language and mode
- settings. Confirm that all required tables for the language and mode
- are present.
-
- lang (str): The language code.
- mode (str): The lemmatizer mode.
- lookups (Lookups): The provided lookups, may be None if the default
- lookups should be loaded.
- RETURNS (Lookups): The Lookups object.
-
- DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
- """
- config = cls.get_lookups_config(mode)
- required_tables = config.get("required_tables", [])
- optional_tables = config.get("optional_tables", [])
- if lookups is None:
- lookups = load_lookups(lang=lang, tables=required_tables)
- optional_lookups = load_lookups(
- lang=lang, tables=optional_tables, strict=False
- )
- for table in optional_lookups.tables:
- lookups.set_table(table, optional_lookups.get_table(table))
- for table in required_tables:
- if table not in lookups:
- raise ValueError(
- Errors.E1004.format(
- mode=mode, tables=required_tables, found=lookups.tables
- )
- )
- return lookups
+ return (["lemma_rules"], ["lemma_exc", "lemma_index"])
+ return ([], [])
def __init__(
self,
@@ -107,7 +62,6 @@ class Lemmatizer(Pipe):
name: str = "lemmatizer",
*,
mode: str = "lookup",
- lookups: Optional[Lookups] = None,
overwrite: bool = False,
) -> None:
"""Initialize a Lemmatizer.
@@ -116,9 +70,6 @@ class Lemmatizer(Pipe):
model (Model): A model (not yet implemented).
name (str): The component name. Defaults to "lemmatizer".
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
- lookups (Lookups): The lookups object containing the (optional) tables
- such as "lemma_rules", "lemma_index", "lemma_exc" and
- "lemma_lookup". Defaults to None
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
`False`.
@@ -128,8 +79,9 @@ class Lemmatizer(Pipe):
self.model = model
self.name = name
self._mode = mode
- self.lookups = lookups if lookups is not None else Lookups()
+ self.lookups = Lookups()
self.overwrite = overwrite
+ self._validated = False
if self.mode == "lookup":
self.lemmatize = self.lookup_lemmatize
elif self.mode == "rule":
@@ -153,12 +105,56 @@ class Lemmatizer(Pipe):
DOCS: https://nightly.spacy.io/api/lemmatizer#call
"""
+ if not self._validated:
+ self._validate_tables(Errors.E1004)
for token in doc:
if self.overwrite or token.lemma == 0:
token.lemma_ = self.lemmatize(token)[0]
return doc
- def pipe(self, stream, *, batch_size=128):
+ def initialize(
+ self,
+ get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+ *,
+ nlp: Optional[Language] = None,
+ lookups: Optional[Lookups] = None,
+ ):
+ """Initialize the lemmatizer and load in data.
+
+ get_examples (Callable[[], Iterable[Example]]): Function that
+ returns a representative sample of gold-standard Example objects.
+ nlp (Language): The current nlp object the component is part of.
+ lookups (Lookups): The lookups object containing the (optional) tables
+ such as "lemma_rules", "lemma_index", "lemma_exc" and
+ "lemma_lookup". Defaults to None.
+ """
+ required_tables, optional_tables = self.get_lookups_config(self.mode)
+ if lookups is None:
+ logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
+ lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+ optional_lookups = load_lookups(
+ lang=self.vocab.lang, tables=optional_tables, strict=False
+ )
+ for table in optional_lookups.tables:
+ lookups.set_table(table, optional_lookups.get_table(table))
+ self.lookups = lookups
+ self._validate_tables(Errors.E1004)
+
+ def _validate_tables(self, error_message: str = Errors.E912) -> None:
+ """Check that the lookups are correct for the current mode."""
+ required_tables, optional_tables = self.get_lookups_config(self.mode)
+ for table in required_tables:
+ if table not in self.lookups:
+ raise ValueError(
+ error_message.format(
+ mode=self.mode,
+ tables=required_tables,
+ found=self.lookups.tables,
+ )
+ )
+ self._validated = True
+
+ def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
@@ -263,7 +259,7 @@ class Lemmatizer(Pipe):
"""
return False
- def score(self, examples, **kwargs) -> Dict[str, Any]:
+ def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
@@ -274,58 +270,66 @@ class Lemmatizer(Pipe):
validate_examples(examples, "Lemmatizer.score")
return Scorer.score_token_attr(examples, "lemma", **kwargs)
- def to_disk(self, path, *, exclude=tuple()):
- """Save the current state to a directory.
+ def to_disk(
+ self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+ ):
+ """Serialize the pipe to disk.
- path (unicode or Path): A path to a directory, which will be created if
- it doesn't exist.
- exclude (list): String names of serialization fields to exclude.
+ path (str / Path): Path to a directory.
+ exclude (Iterable[str]): String names of serialization fields to exclude.
- DOCS: https://nightly.spacy.io/api/vocab#to_disk
+ DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
"""
serialize = {}
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["lookups"] = lambda p: self.lookups.to_disk(p)
util.to_disk(path, serialize, exclude)
- def from_disk(self, path, *, exclude=tuple()):
- """Loads state from a directory. Modifies the object in place and
- returns it.
+ def from_disk(
+ self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+ ) -> "Lemmatizer":
+ """Load the pipe from disk. Modifies the object in place and returns it.
- path (unicode or Path): A path to a directory.
- exclude (list): String names of serialization fields to exclude.
- RETURNS (Vocab): The modified `Vocab` object.
+ path (str / Path): Path to a directory.
+ exclude (Iterable[str]): String names of serialization fields to exclude.
+ RETURNS (Lemmatizer): The modified Lemmatizer object.
- DOCS: https://nightly.spacy.io/api/vocab#to_disk
+ DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
"""
deserialize = {}
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
util.from_disk(path, deserialize, exclude)
+ self._validate_tables()
+ return self
- def to_bytes(self, *, exclude=tuple()) -> bytes:
- """Serialize the current state to a binary string.
+ def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
+ """Serialize the pipe to a bytestring.
- exclude (list): String names of serialization fields to exclude.
- RETURNS (bytes): The serialized form of the `Vocab` object.
+ exclude (Iterable[str]): String names of serialization fields to exclude.
+ RETURNS (bytes): The serialized object.
- DOCS: https://nightly.spacy.io/api/vocab#to_bytes
+ DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
"""
serialize = {}
serialize["vocab"] = self.vocab.to_bytes
serialize["lookups"] = self.lookups.to_bytes
return util.to_bytes(serialize, exclude)
- def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
- """Load state from a binary string.
+ def from_bytes(
+ self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
+ ) -> "Lemmatizer":
+ """Load the pipe from a bytestring.
- bytes_data (bytes): The data to load from.
- exclude (list): String names of serialization fields to exclude.
- RETURNS (Vocab): The `Vocab` object.
+ bytes_data (bytes): The serialized pipe.
+ exclude (Iterable[str]): String names of serialization fields to exclude.
+ RETURNS (Lemmatizer): The loaded Lemmatizer.
- DOCS: https://nightly.spacy.io/api/vocab#from_bytes
+ DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
"""
deserialize = {}
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
util.from_bytes(bytes_data, deserialize, exclude)
+ self._validate_tables()
+ return self
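With the `lookups` argument removed from the factory, table loading moves into `Lemmatizer.initialize`, which either pulls the tables from `spacy-lookups-data` or accepts a `Lookups` object directly (in a config, via the `[initialize.components.lemmatizer]` block). A minimal sketch of the new flow, mirroring the updated tests; the table contents are illustrative:

```python
from spacy.lang.en import English
from spacy.lookups import Lookups

nlp = English()
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
# The component starts without tables and raises E1004 if called before initialization
lookups = Lookups()
lookups.add_table("lemma_lookup", {"coping": "cope"})
lemmatizer.initialize(lookups=lookups)
assert nlp("coping")[0].lemma_ == "cope"
```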
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 1125fa7da..591b7e134 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -282,7 +282,7 @@ class ModelMetaSchema(BaseModel):
sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
vectors: Dict[str, Any] = Field({}, title="Included word vectors")
labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
- performance: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy and speed numbers")
+ performance: Dict[str, Union[float, Dict[str, Union[float, dict]]]] = Field({}, title="Accuracy and speed numbers")
spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
# fmt: on
diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py
index 56c80dd66..b44b13d4c 100644
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@@ -77,7 +77,7 @@ def test_morph_property(tokenizer):
assert doc.to_array(["MORPH"])[0] != 0
# unset with token.morph
- doc[0].set_morph(0)
+ doc[0].set_morph(None)
assert doc.to_array(["MORPH"])[0] == 0
# empty morph is equivalent to "_"
diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py
index 6e7f82341..5f45664eb 100644
--- a/spacy/tests/lang/test_lemmatizers.py
+++ b/spacy/tests/lang/test_lemmatizers.py
@@ -17,16 +17,31 @@ def test_lemmatizer_initialize(lang, capfd):
@registry.misc("lemmatizer_init_lookups")
def lemmatizer_init_lookups():
lookups = Lookups()
- lookups.add_table("lemma_lookup", {"cope": "cope"})
+ lookups.add_table("lemma_lookup", {"cope": "cope", "x": "y"})
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
return lookups
- """Test that languages can be initialized."""
+ # Test that languages can be initialized
nlp = get_lang_class(lang)()
- nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
+ lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+ assert not lemmatizer.lookups.tables
+ nlp.config["initialize"]["components"]["lemmatizer"] = {
+ "lookups": {"@misc": "lemmatizer_init_lookups"}
+ }
+ with pytest.raises(ValueError):
+ nlp("x")
+ nlp.initialize()
+ assert lemmatizer.lookups.tables
+ doc = nlp("x")
# Check for stray print statements (see #3342)
- doc = nlp("test") # noqa: F841
captured = capfd.readouterr()
assert not captured.out
+ assert doc[0].lemma_ == "y"
+
+ # Test initialization by calling .initialize() directly
+ nlp = get_lang_class(lang)()
+ lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+ lemmatizer.initialize(lookups=lemmatizer_init_lookups())
+ assert nlp("x")[0].lemma_ == "y"
diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py
index 05e15bc16..d37c87059 100644
--- a/spacy/tests/pipeline/test_lemmatizer.py
+++ b/spacy/tests/pipeline/test_lemmatizer.py
@@ -8,61 +8,52 @@ from ..util import make_tempdir
@pytest.fixture
def nlp():
- return English()
-
-
-@pytest.fixture
-def lemmatizer(nlp):
@registry.misc("cope_lookups")
def cope_lookups():
lookups = Lookups()
- lookups.add_table("lemma_lookup", {"cope": "cope"})
+ lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
return lookups
- lemmatizer = nlp.add_pipe(
- "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
- )
- return lemmatizer
+ nlp = English()
+ nlp.config["initialize"]["components"]["lemmatizer"] = {
+ "lookups": {"@misc": "cope_lookups"}
+ }
+ return nlp
def test_lemmatizer_init(nlp):
- @registry.misc("cope_lookups")
- def cope_lookups():
- lookups = Lookups()
- lookups.add_table("lemma_lookup", {"cope": "cope"})
- lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
- lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
- lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
- return lookups
-
- lemmatizer = nlp.add_pipe(
- "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
- )
+ lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
assert isinstance(lemmatizer.lookups, Lookups)
+ assert not lemmatizer.lookups.tables
assert lemmatizer.mode == "lookup"
+ with pytest.raises(ValueError):
+ nlp("test")
+ nlp.initialize()
+ assert lemmatizer.lookups.tables
+ assert nlp("cope")[0].lemma_ == "cope"
+ assert nlp("coped")[0].lemma_ == "cope"
# replace any tables from spacy-lookups-data
lemmatizer.lookups = Lookups()
- doc = nlp("coping")
# lookup with no tables sets text as lemma
- assert doc[0].lemma_ == "coping"
-
+ assert nlp("cope")[0].lemma_ == "cope"
+ assert nlp("coped")[0].lemma_ == "coped"
nlp.remove_pipe("lemmatizer")
-
- @registry.misc("empty_lookups")
- def empty_lookups():
- return Lookups()
-
+ lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
with pytest.raises(ValueError):
- nlp.add_pipe(
- "lemmatizer",
- config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
- )
+ # Can't initialize without required tables
+ lemmatizer.initialize(lookups=Lookups())
+ lookups = Lookups()
+ lookups.add_table("lemma_lookup", {})
+ lemmatizer.initialize(lookups=lookups)
-def test_lemmatizer_config(nlp, lemmatizer):
+def test_lemmatizer_config(nlp):
+ lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+ nlp.initialize()
+
doc = nlp.make_doc("coping")
doc[0].pos_ = "VERB"
assert doc[0].lemma_ == ""
@@ -78,20 +69,21 @@ def test_lemmatizer_config(nlp, lemmatizer):
assert doc[0].lemma_ == "cope"
-def test_lemmatizer_serialize(nlp, lemmatizer):
- @registry.misc("cope_lookups")
+def test_lemmatizer_serialize(nlp):
+ lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+ nlp.initialize()
+
def cope_lookups():
lookups = Lookups()
- lookups.add_table("lemma_lookup", {"cope": "cope"})
+ lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
return lookups
nlp2 = English()
- lemmatizer2 = nlp2.add_pipe(
- "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
- )
+ lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
+ lemmatizer2.initialize(lookups=cope_lookups())
lemmatizer2.from_bytes(lemmatizer.to_bytes())
assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
@@ -100,9 +92,9 @@ def test_lemmatizer_serialize(nlp, lemmatizer):
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
- doc2 = nlp2.make_doc("coping")
- doc2[0].pos_ = "VERB"
- assert doc2[0].lemma_ == ""
- doc2 = lemmatizer(doc2)
- assert doc2[0].text == "coping"
- assert doc2[0].lemma_ == "cope"
+ doc2 = nlp2.make_doc("coping")
+ doc2[0].pos_ = "VERB"
+ assert doc2[0].lemma_ == ""
+ doc2 = lemmatizer(doc2)
+ assert doc2[0].text == "coping"
+ assert doc2[0].lemma_ == "cope"
diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py
index 4a976fc02..8b6adb83b 100644
--- a/spacy/tests/serialize/test_serialize_doc.py
+++ b/spacy/tests/serialize/test_serialize_doc.py
@@ -1,3 +1,6 @@
+import pytest
+from spacy.tokens.doc import Underscore
+
import spacy
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin
@@ -86,3 +89,20 @@ def test_serialize_doc_bin_unknown_spaces(en_vocab):
assert re_doc1.text == "that 's "
assert not re_doc2.has_unknown_spaces
assert re_doc2.text == "that's"
+
+
+@pytest.mark.parametrize(
+ "writer_flag,reader_flag,reader_value", [(True, True, "bar"), (True, False, "bar"), (False, True, "nothing"), (False, False, "nothing")]
+)
+def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value):
+ """Test that custom extensions are correctly serialized in DocBin."""
+ Doc.set_extension("foo", default="nothing")
+ doc = Doc(en_vocab, words=["hello", "world"])
+ doc._.foo = "bar"
+ doc_bin_1 = DocBin(store_user_data=writer_flag)
+ doc_bin_1.add(doc)
+ doc_bin_bytes = doc_bin_1.to_bytes()
+ doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes)
+ doc_2 = list(doc_bin_2.get_docs(en_vocab))[0]
+ assert doc_2._.foo == reader_value
+ Underscore.doc_extensions = {}
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 405801f62..7d41c8908 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -7,11 +7,11 @@ from spacy.training.converters import json_to_docs
from spacy.training.augment import create_orth_variants_augmenter
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin
-from spacy.lookups import Lookups
from spacy.util import get_words_and_spaces, minibatch
from thinc.api import compounding
import pytest
import srsly
+import random
from ..util import make_tempdir
@@ -504,9 +504,9 @@ def test_make_orth_variants(doc):
{"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
]
}
- lookups = Lookups()
- lookups.add_table("orth_variants", orth_variants)
- augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)
+ augmenter = create_orth_variants_augmenter(
+ level=0.2, lower=0.5, orth_variants=orth_variants
+ )
with make_tempdir() as tmpdir:
output_file = tmpdir / "roundtrip.spacy"
DocBin(docs=[doc]).to_disk(output_file)
@@ -515,6 +515,39 @@ def test_make_orth_variants(doc):
list(reader(nlp))
+@pytest.mark.filterwarnings("ignore::UserWarning")
+def test_custom_data_augmentation(doc):
+ def create_spongebob_augmenter(randomize: bool = False):
+ def augment(nlp, example):
+ text = example.text
+ if randomize:
+ ch = [c.lower() if random.random() < 0.5 else c.upper() for c in text]
+ else:
+ ch = [c.lower() if i % 2 else c.upper() for i, c in enumerate(text)]
+ example_dict = example.to_dict()
+ doc = nlp.make_doc("".join(ch))
+ example_dict["token_annotation"]["ORTH"] = [t.text for t in doc]
+ yield example
+ yield example.from_dict(doc, example_dict)
+
+ return augment
+
+ nlp = English()
+ with make_tempdir() as tmpdir:
+ output_file = tmpdir / "roundtrip.spacy"
+ DocBin(docs=[doc]).to_disk(output_file)
+ reader = Corpus(output_file, augmenter=create_spongebob_augmenter())
+ corpus = list(reader(nlp))
+ orig_text = "Sarah 's sister flew to Silicon Valley via London . "
+ augmented = "SaRaH 's sIsTeR FlEw tO SiLiCoN VaLlEy vIa lOnDoN . "
+ assert corpus[0].text == orig_text
+ assert corpus[0].reference.text == orig_text
+ assert corpus[0].predicted.text == orig_text
+ assert corpus[1].text == augmented
+ assert corpus[1].reference.text == augmented
+ assert corpus[1].predicted.text == augmented
+
+
@pytest.mark.skip("Outdated")
@pytest.mark.parametrize(
"tokens_a,tokens_b,expected",
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index ed283a86b..11eb75821 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -58,7 +58,7 @@ class DocBin:
attrs (Iterable[str]): List of attributes to serialize. 'orth' and
'spacy' are always serialized, so they're not required.
- store_user_data (bool): Whether to include the `Doc.user_data`.
+ store_user_data (bool): Whether to write the `Doc.user_data` to bytes/file.
docs (Iterable[Doc]): Docs to add.
DOCS: https://nightly.spacy.io/api/docbin#init
@@ -106,11 +106,12 @@ class DocBin:
self.strings.add(token.ent_type_)
self.strings.add(token.ent_kb_id_)
self.cats.append(doc.cats)
- if self.store_user_data:
- self.user_data.append(srsly.msgpack_dumps(doc.user_data))
+ self.user_data.append(srsly.msgpack_dumps(doc.user_data))
def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
"""Recover Doc objects from the annotations, using the given vocab.
+ Note that the user data of each doc will be read (if available) and returned,
+ regardless of the setting of 'self.store_user_data'.
vocab (Vocab): The shared vocab.
YIELDS (Doc): The Doc objects.
@@ -129,7 +130,7 @@ class DocBin:
doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
doc = doc.from_array(self.attrs, tokens)
doc.cats = self.cats[i]
- if self.store_user_data:
+ if i < len(self.user_data) and self.user_data[i] is not None:
user_data = srsly.msgpack_loads(self.user_data[i], use_list=False)
doc.user_data.update(user_data)
yield doc
@@ -137,21 +138,31 @@ class DocBin:
def merge(self, other: "DocBin") -> None:
"""Extend the annotations of this DocBin with the annotations from
another. Will raise an error if the pre-defined attrs of the two
- DocBins don't match.
+ DocBins don't match, or if they differ in whether or not to store
+ user data.
other (DocBin): The DocBin to merge into the current bin.
DOCS: https://nightly.spacy.io/api/docbin#merge
"""
if self.attrs != other.attrs:
- raise ValueError(Errors.E166.format(current=self.attrs, other=other.attrs))
+ raise ValueError(
+ Errors.E166.format(param="attrs", current=self.attrs, other=other.attrs)
+ )
+ if self.store_user_data != other.store_user_data:
+ raise ValueError(
+ Errors.E166.format(
+ param="store_user_data",
+ current=self.store_user_data,
+ other=other.store_user_data,
+ )
+ )
self.tokens.extend(other.tokens)
self.spaces.extend(other.spaces)
self.strings.update(other.strings)
self.cats.extend(other.cats)
self.flags.extend(other.flags)
- if self.store_user_data:
- self.user_data.extend(other.user_data)
+ self.user_data.extend(other.user_data)
def to_bytes(self) -> bytes:
"""Serialize the DocBin's annotations to a bytestring.
@@ -200,8 +211,10 @@ class DocBin:
self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
self.cats = msg["cats"]
self.flags = msg.get("flags", [{} for _ in lengths])
- if self.store_user_data and "user_data" in msg:
+ if "user_data" in msg:
self.user_data = list(msg["user_data"])
+ else:
+ self.user_data = [None] * len(self)
for tokens in self.tokens:
assert len(tokens.shape) == 2, tokens.shape # this should never happen
return self
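`DocBin.merge` now also refuses to combine bins that differ in `store_user_data`, reporting the mismatched parameter through the reworded E166. A minimal sketch of the failure mode:

```python
from spacy.tokens import DocBin

with_data = DocBin(store_user_data=True)
without_data = DocBin(store_user_data=False)
try:
    with_data.merge(without_data)
except ValueError as err:
    print(err)  # E166: DocBins must agree on 'store_user_data'
```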
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 8099abd92..2075c3cc8 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -223,8 +223,10 @@ cdef class Token:
def set_morph(self, features):
cdef hash_t key
- if features is 0:
+ if features is None:
self.c.morph = 0
+ elif isinstance(features, MorphAnalysis):
+ self.morph = features
else:
if isinstance(features, int):
features = self.vocab.strings[features]
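`Token.set_morph` now clears the analysis on `None` instead of `0` and accepts an existing `MorphAnalysis` object. A minimal sketch:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("two tokens")
doc[0].set_morph("Number=Plur")
doc[1].set_morph(doc[0].morph)  # copy an existing MorphAnalysis
doc[0].set_morph(None)          # clear the analysis again
assert doc.to_array(["MORPH"])[0] == 0
assert str(doc[1].morph) == "Number=Plur"
```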
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 176530a1c..8965c5457 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -1,27 +1,43 @@
-from typing import Callable, Iterator, Dict, List, Tuple, Optional, TYPE_CHECKING
+from typing import Callable, Iterator, Dict, List, Tuple, TYPE_CHECKING
import random
import itertools
import copy
from functools import partial
+from pydantic import BaseModel, StrictStr
from ..util import registry, logger
from ..tokens import Doc
from .example import Example
-from ..lookups import Lookups
-from ..errors import Errors
if TYPE_CHECKING:
from ..language import Language # noqa: F401
+class OrthVariantsSingle(BaseModel):
+ tags: List[StrictStr]
+ variants: List[StrictStr]
+
+
+class OrthVariantsPaired(BaseModel):
+ tags: List[StrictStr]
+ variants: List[List[StrictStr]]
+
+
+class OrthVariants(BaseModel):
+    paired: List[OrthVariantsPaired] = []
+    single: List[OrthVariantsSingle] = []
+
+
@registry.augmenters("spacy.orth_variants.v1")
def create_orth_variants_augmenter(
- level: float, lower: float, lookups: Optional[Lookups] = None,
+ level: float, lower: float, orth_variants: OrthVariants,
) -> Callable[["Language", Example], Iterator[Example]]:
"""Create a data augmentation callback that uses orth-variant replacement.
The callback can be added to a corpus or other data iterator during training.
"""
- return partial(orth_variants_augmenter, level=level, lower=lower, lookups=lookups)
+ return partial(
+ orth_variants_augmenter, orth_variants=orth_variants, level=level, lower=lower
+ )
def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
@@ -31,20 +47,11 @@ def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
def orth_variants_augmenter(
nlp: "Language",
example: Example,
+ orth_variants: dict,
*,
level: float = 0.0,
lower: float = 0.0,
- lookups: Optional[Lookups] = None,
) -> Iterator[Example]:
- table_name = "orth_variants"
- if lookups is not None:
- orth_variants = lookups.get_table(table_name, {})
- logger.debug("Using data augmentation orth variants from provided lookups")
- else:
- orth_variants = nlp.vocab.lookups.get_table(table_name, {})
- logger.debug("Using data augmentation orth variants from default vocab lookups")
- if not orth_variants:
- raise ValueError(Errors.E912.format(lang=nlp.lang))
if random.random() >= level:
yield example
else:
@@ -74,13 +81,14 @@ def make_orth_variants(
nlp: "Language",
raw: str,
token_dict: Dict[str, List[str]],
- orth_variants: Dict[str, list],
+ orth_variants: Dict[str, List[Dict[str, List[str]]]],
*,
lower: bool = False,
) -> Tuple[str, Dict[str, List[str]]]:
orig_token_dict = copy.deepcopy(token_dict)
ndsv = orth_variants.get("single", [])
ndpv = orth_variants.get("paired", [])
+ logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
words = token_dict.get("words", [])
tags = token_dict.get("tags", [])
# keep unmodified if words or tags are not defined
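The orth-variant augmenter now receives its variant data directly as `orth_variants` instead of reading it from a `Lookups` table. A minimal sketch of building the augmenter in code, using the same illustrative variant data as the updated test:

```python
from spacy.training.augment import create_orth_variants_augmenter

orth_variants = {
    "single": [{"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}],
    "paired": [],
}
augmenter = create_orth_variants_augmenter(
    level=0.2, lower=0.5, orth_variants=orth_variants
)
# Pass it to a corpus reader, e.g. Corpus("./train.spacy", augmenter=augmenter)
```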
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 57787cf76..b3ff30e66 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -38,11 +38,11 @@ def create_docbin_reader(
)
-@util.registry.readers("spacy.JsonlReader.v1")
+@util.registry.readers("spacy.JsonlCorpus.v1")
def create_jsonl_reader(
path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0
) -> Callable[["Language"], Iterable[Doc]]:
- return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
+ return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit)
@util.registry.readers("spacy.read_labels.v1")
@@ -193,7 +193,7 @@ class Corpus:
break
-class JsonlTexts:
+class JsonlCorpus:
"""Iterate Doc objects from a file or directory of jsonl
formatted raw text files.
@@ -206,7 +206,7 @@ class JsonlTexts:
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
Defaults to 0, which indicates no limit.
- DOCS: https://nightly.spacy.io/api/corpus#jsonltexts
+ DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus
"""
file_type = "jsonl"
@@ -230,7 +230,7 @@ class JsonlTexts:
nlp (Language): The current nlp object.
YIELDS (Example): The example objects.
- DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call
+ DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus-call
"""
for loc in walk_corpus(self.path, ".jsonl"):
records = srsly.read_jsonl(loc)
diff --git a/spacy/util.py b/spacy/util.py
index 8a96ba4fe..f234927d6 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -103,10 +103,6 @@ class registry(thinc.registry):
cli = catalogue.create("spacy", "cli", entry_points=True)
-# We want json loading in the registry, so manually register srsly.read_json.
-registry.readers("srsly.read_json.v0", srsly.read_json)
-
-
class SimpleFrozenDict(dict):
"""Simplified implementation of a frozen dict, mainly used as default
function or method argument (for arguments that should default to empty
diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md
index 58006a19b..986c6f458 100644
--- a/website/docs/api/corpus.md
+++ b/website/docs/api/corpus.md
@@ -100,7 +100,7 @@ Yield examples from the data.
| `nlp` | The current `nlp` object. ~~Language~~ |
| **YIELDS** | The examples. ~~Example~~ |
-## JsonlTexts {#jsonltexts tag="class"}
+## JsonlCorpus {#jsonlcorpus tag="class"}
Iterate Doc objects from a file or directory of JSONL (newline-delimited JSON)
formatted raw text files. Can be used to read the raw text corpus for language
@@ -126,22 +126,22 @@ file.
{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
```
-### JsonlTexts.\_\init\_\_ {#jsonltexts-init tag="method"}
+### JsonlCorpus.\_\_init\_\_ {#jsonlcorpus-init tag="method"}
Initialize the reader.
> #### Example
>
> ```python
-> from spacy.training import JsonlTexts
+> from spacy.training import JsonlCorpus
>
-> corpus = JsonlTexts("./data/texts.jsonl")
+> corpus = JsonlCorpus("./data/texts.jsonl")
> ```
>
> ```ini
> ### Example config
> [corpora.pretrain]
-> @readers = "spacy.JsonlReader.v1"
+> @readers = "spacy.JsonlCorpus.v1"
> path = "corpus/raw_text.jsonl"
> min_length = 0
> max_length = 0
@@ -156,17 +156,17 @@ Initialize the reader.
| `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
-### JsonlTexts.\_\_call\_\_ {#jsonltexts-call tag="method"}
+### JsonlCorpus.\_\_call\_\_ {#jsonlcorpus-call tag="method"}
Yield examples from the data.
> #### Example
>
> ```python
-> from spacy.training import JsonlTexts
+> from spacy.training import JsonlCorpus
> import spacy
>
-> corpus = JsonlTexts("./texts.jsonl")
+> corpus = JsonlCorpus("./texts.jsonl")
> nlp = spacy.blank("en")
> data = corpus(nlp)
> ```
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 22a0076cd..c1b9bfef4 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -135,7 +135,7 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy
> path = ${paths:dev}
>
> [corpora.pretrain]
-> @readers = "spacy.JsonlReader.v1"
+> @readers = "spacy.JsonlCorpus.v1"
> path = ${paths.raw}
>
> [corpora.my_custom_data]
@@ -146,7 +146,7 @@ This section defines a **dictionary** mapping of string keys to functions. Each
function takes an `nlp` object and yields [`Example`](/api/example) objects. By
default, the two keys `train` and `dev` are specified and each refer to a
[`Corpus`](/api/top-level#Corpus). When pretraining, an additional `pretrain`
-section is added that defaults to a [`JsonlReader`](/api/top-level#JsonlReader).
+section is added that defaults to a [`JsonlCorpus`](/api/top-level#JsonlCorpus).
You can also register custom functions that return a callable.
| Name | Description |
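To make the last sentence concrete, here is a minimal sketch (not part of the diff) of a custom corpora function such as the `my_custom_data` entry in the example config. The reader name, the `source` parameter and the texts are placeholders; the registration mirrors the `spacy.read_labels.v1` decorator shown earlier in this diff.

```python
# Sketch: a registered corpora function returning a callable that takes the
# nlp object and yields Example objects, as described above.
from spacy.util import registry
from spacy.training import Example

@registry.readers("my_custom_data.v1")  # hypothetical reader name
def create_custom_reader(source: str):
    def read_custom(nlp):
        for text in ["Placeholder text one.", "Placeholder text two."]:
            doc = nlp.make_doc(text)
            yield Example.from_dict(doc, {})
    return read_custom
```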
diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md
index 03aff2f6e..3625ed790 100644
--- a/website/docs/api/docbin.md
+++ b/website/docs/api/docbin.md
@@ -47,7 +47,7 @@ Create a `DocBin` object to hold serialized annotations.
| Argument | Description |
| ----------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `attrs` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. ~~Iterable[str]~~ |
-| `store_user_data` | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. ~~bool~~ |
+| `store_user_data` | Whether to write the `Doc.user_data` and the values of custom extension attributes to file/bytes. Defaults to `False`. ~~bool~~ |
| `docs` | `Doc` objects to add on initialization. ~~Iterable[Doc]~~ |
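A minimal sketch of the `store_user_data` behavior described in the updated row above (not part of the diff; the extension name and text are made up): with `store_user_data=True`, custom extension values are written out alongside the token attributes.

```python
# Sketch: serialize a Doc with user data and extension values included.
import spacy
from spacy.tokens import Doc, DocBin

Doc.set_extension("source", default=None)  # hypothetical custom extension
nlp = spacy.blank("en")
doc = nlp("A text to serialize.")
doc._.source = "example.jsonl"
doc_bin = DocBin(store_user_data=True, docs=[doc])
data = doc_bin.to_bytes()  # extension values travel with the bytes
```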
## DocBin.\_\_len\_\_ {#len tag="method"}
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 9f0612b2b..6257199c9 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -8,8 +8,8 @@ source: spacy/language.py
Usually you'll load this once per process as `nlp` and pass the instance around
your application. The `Language` class is created when you call
[`spacy.load`](/api/top-level#spacy.load) and contains the shared vocabulary and
-[language data](/usage/adding-languages), optional binary weights, e.g. provided
-by a [trained pipeline](/models), and the
+[language data](/usage/linguistic-features#language-data), optional binary
+weights, e.g. provided by a [trained pipeline](/models), and the
[processing pipeline](/usage/processing-pipelines) containing components like
the tagger or parser that are called on a document in order. You can also add
your own processing pipeline components that take a `Doc` object, modify it and
@@ -210,7 +210,9 @@ settings defined in the [`[initialize]`](/api/data-formats#config-initialize)
config block to set up the vocabulary, load in vectors and tok2vec weights and
pass optional arguments to the `initialize` methods implemented by pipeline
components or the tokenizer. This method is typically called automatically when
-you run [`spacy train`](/api/cli#train).
+you run [`spacy train`](/api/cli#train). See the usage guide on the
+[config lifecycle](/usage/training#config-lifecycle) and
+[initialization](/usage/training#initialization) for details.
`get_examples` should be a function that returns an iterable of
[`Example`](/api/example) objects. The data examples can either be the full
@@ -928,7 +930,7 @@ Serialize the current state to a binary string.
Load state from a binary string. Note that this method is commonly used via the
subclasses like `English` or `German` to make language-specific functionality
-like the [lexical attribute getters](/usage/adding-languages#lex-attrs)
+like the [lexical attribute getters](/usage/linguistic-features#language-data)
available to the loaded object.
> #### Example
diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md
index 3693429c4..27ea04432 100644
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@@ -48,12 +48,11 @@ data format used by the lookup and rule-based lemmatizers, see
> nlp.add_pipe("lemmatizer", config=config)
> ```
-| Setting | Description |
-| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
-| `lookups` | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
-| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
-| `model` | **Not yet implemented:** the model to use. ~~Model~~ |
+| Setting | Description |
+| ----------- | --------------------------------------------------------------------------------- |
+| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
+| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
+| `model` | **Not yet implemented:** the model to use. ~~Model~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
@@ -76,15 +75,14 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
-| Name | Description |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | The shared vocabulary. ~~Vocab~~ |
-| `model` | **Not yet implemented:** The model to use. ~~Model~~ |
-| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
-| _keyword-only_ | |
-| mode | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
-| lookups | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. ~~Optional[Lookups]~~ |
-| overwrite | Whether to overwrite existing lemmas. ~~bool~~ |
+| Name | Description |
+| -------------- | --------------------------------------------------------------------------------------------------- |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `model` | **Not yet implemented:** The model to use. ~~Model~~ |
+| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| _keyword-only_ | |
+| `mode`         | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~                    |
+| `overwrite`    | Whether to overwrite existing lemmas. ~~bool~~                                                        |
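A short sketch tying the table above to the earlier `nlp.add_pipe` example (not part of the diff; assumes the lemma tables from `spacy-lookups-data` are available when the pipeline is actually initialized and run).

```python
# Sketch: add the lemmatizer by its string name and pick a mode via config.
import spacy

nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
```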
## Lemmatizer.\_\_call\_\_ {#call tag="method"}
@@ -127,11 +125,41 @@ applied to the `Doc` in order.
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
+## Lemmatizer.initialize {#initialize tag="method"}
+
+Initialize the lemmatizer and load any data resources. This method is typically
+called by [`Language.initialize`](/api/language#initialize) and lets you
+customize arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config. The loading only happens during initialization, typically before
+training. At runtime, all data is loaded from disk.
+
+> #### Example
+>
+> ```python
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer.initialize(lookups=lookups)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.lemmatizer]
+>
+> [initialize.components.lemmatizer.lookups]
+> @misc = "load_my_lookups.v1"
+> ```
+
+| Name | Description |
+| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Defaults to `None`. ~~Optional[Callable[[], Iterable[Example]]]~~ |
+| _keyword-only_ | |
+| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `lookups` | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
+
## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
Lemmatize a token using a lookup-based approach. If no lemma is found, the
-original string is returned. Languages can provide a
-[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.
+original string is returned.
| Name | Description |
| ----------- | --------------------------------------------------- |
diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index 068a1d2d2..e7e66e931 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -172,6 +172,25 @@ Get a neighboring token.
| `i` | The relative position of the token to get. Defaults to `1`. ~~int~~ |
| **RETURNS** | The token at position `self.doc[self.i+i]`. ~~Token~~ |
+## Token.set_morph {#set_morph tag="method"}
+
+Set the morphological analysis from a UD FEATS string, hash value of a UD FEATS
+string, features dict or `MorphAnalysis`. The value `None` can be used to reset
+the morph to an unset state.
+
+> #### Example
+>
+> ```python
+> doc = nlp("Give it back! He pleaded.")
+> doc[0].set_morph("Mood=Imp|VerbForm=Fin")
+> assert "Mood=Imp" in doc[0].morph
+> assert doc[0].morph.get("Mood") == ["Imp"]
+> ```
+
+| Name | Description |
+| -------- | --------------------------------------------------------------------------------- |
+| features | The morphological features to set. ~~Union[int, dict, str, MorphAnalysis, None]~~ |
+
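A small sketch of the other accepted inputs mentioned above (not part of the diff): `set_morph` also accepts a features dict, and `None` clears the analysis.

```python
# Sketch: set the morph from a features dict, then reset it with None.
import spacy

nlp = spacy.blank("en")
doc = nlp("Give it back!")
doc[0].set_morph({"Mood": "Imp", "VerbForm": "Fin"})
doc[0].set_morph(None)  # back to an unset state
```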
## Token.is_ancestor {#is_ancestor tag="method" model="parser"}
Check whether this token is a parent, grandparent, etc. of another in the
@@ -392,74 +411,73 @@ The L2 norm of the token's vector representation.
## Attributes {#attributes}
-| Name | Description |
-| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `doc` | The parent document. ~~Doc~~ |
-| `lex`