Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-16 11:12:25 +03:00)

Commit 6cdc090e0e: Merge branch 'develop' into nightly.spacy.io

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a42,<8.0.0a50",
+    "thinc>=8.0.0a43,<8.0.0a50",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"

@@ -1,12 +1,12 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a42,<8.0.0a50
+thinc>=8.0.0a43,<8.0.0a50
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
-srsly>=2.1.0,<3.0.0
+srsly>=2.3.0,<3.0.0
 catalogue>=2.0.1,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy

setup.cfg (10 changed lines)

@@ -34,16 +34,16 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a42,<8.0.0a50
+    thinc>=8.0.0a43,<8.0.0a50
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a42,<8.0.0a50
+    thinc>=8.0.0a43,<8.0.0a50
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
-    srsly>=2.1.0,<3.0.0
+    srsly>=2.3.0,<3.0.0
     catalogue>=2.0.1,<2.1.0
     typer>=0.3.0,<0.4.0
     pathy
@@ -66,6 +66,8 @@ console_scripts =
 [options.extras_require]
 lookups =
     spacy_lookups_data==1.0.0rc0
+transformers =
+    spacy_transformers>=1.0.0a17,<1.0.0
 cuda =
     cupy>=5.0.0b4,<9.0.0
 cuda80 =
@@ -84,7 +86,7 @@ cuda102 =
     cupy-cuda102>=5.0.0b4,<9.0.0
 # Language tokenizers with external dependencies
 ja =
-    sudachipy>=0.4.5
+    sudachipy>=0.4.9
     sudachidict_core>=20200330
 ko =
     natto-py==0.9.0

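The new "transformers" extra only pins spacy_transformers; whether the package is actually importable is still checked at runtime. A minimal, hedged sketch (the import guard below is illustrative and not part of this diff):

# Hedged sketch: confirm the optional dependency added under
# [options.extras_require] "transformers" is available before relying on it.
import spacy

nlp = spacy.blank("en")
try:
    import spacy_transformers  # noqa: F401  # installed via the "transformers" extra
    print("spacy-transformers is available")
except ImportError:
    print("install the 'transformers' extra to use transformer components")
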
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a28"
+__version__ = "3.0.0a29"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

@@ -37,6 +37,22 @@ tokenizer_config = {"use_fast": true}
 window = 128
 stride = 96
 
+{% if "morphologizer" in components %}
+[components.morphologizer]
+factory = "morphologizer"
+
+[components.morphologizer.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.morphologizer.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.morphologizer.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{%- endif %}
+
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
@@ -166,6 +182,19 @@ depth = {{ 4 if optimize == "efficiency" else 8 }}
 window_size = 1
 maxout_pieces = 3
 
+{% if "morphologizer" in components %}
+[components.morphologizer]
+factory = "morphologizer"
+
+[components.morphologizer.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.morphologizer.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{%- endif %}
+
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
@@ -257,7 +286,7 @@ no_output_layer = false
 {% endif %}
 
 {% for pipe in components %}
-{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %}
+{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "entity_linker"] %}
 {# Other components defined by the user: we just assume they're factories #}
 [components.{{ pipe }}]
 factory = "{{ pipe }}"

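The quickstart template now emits a [components.morphologizer] block (backed by spacy.Tagger.v1 and the shared transformer or tok2vec listener) and excludes "morphologizer" from the generic factory loop. A minimal, hedged sketch of the factory the generated block points at, on a blank pipeline (the training config above wires the listener; nothing here is trained):

# Hedged sketch: the generated config's factory = "morphologizer" corresponds
# to this add_pipe call; the default standalone model is used on a blank nlp.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("morphologizer")
print(nlp.pipe_names)  # ['morphologizer']
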
@@ -34,7 +34,7 @@ learn_rate = 0.001
 [corpora]
 
 [corpora.pretrain]
-@readers = "spacy.JsonlReader.v1"
+@readers = "spacy.JsonlCorpus.v1"
 path = ${paths.raw_text}
 min_length = 5
 max_length = 500

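The pretraining corpus reader is renamed from spacy.JsonlReader.v1 to spacy.JsonlCorpus.v1. A hedged sketch of resolving the renamed reader from the registry with the same settings the config uses (the file path and the exact factory signature are assumptions for illustration):

# Hedged sketch: the registry name comes from the config block above.
from spacy import registry

make_corpus = registry.readers.get("spacy.JsonlCorpus.v1")
corpus = make_corpus("raw_text.jsonl", min_length=5, max_length=500)
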
@@ -419,7 +419,7 @@ class Errors:
     E164 = ("x is neither increasing nor decreasing: {}.")
     E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
             "that case.")
-    E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
+    E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
             "Current DocBin: {current}\nOther DocBin: {other}")
     E169 = ("Can't find module: {module}")
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
@@ -477,12 +477,8 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
-    E912 = ("No orth_variants lookups table for data augmentation available for "
-            "language '{lang}'. If orth_variants are available in "
-            "spacy-lookups-data, make sure the package is installed and the "
-            "table is loaded in the [initialize.lookups] block of your config. "
-            "Alternatively, you can provide your own Lookups object with a "
-            "table orth_variants as the argument 'lookuos' of the augmenter.")
+    E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
+            "for mode '{mode}'. Required tables: {tables}. Found: {found}.")
     E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
             "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "
@@ -562,10 +558,10 @@ class Errors:
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
             "component.")
-    E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
-            "spacy-lookups-data. If you want to initialize a blank nlp object, "
-            "make sure you have the spacy-lookups-data package installed or "
-            "remove the [initialize.lookups] block from your config.")
+    E955 = ("Can't find table(s) {table} for language '{lang}' in "
+            "spacy-lookups-data. Make sure you have the package installed or "
+            "provide your own lookup tables if no default lookups are available "
+            "for your language.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "
@@ -691,9 +687,8 @@ class Errors:
     E1002 = ("Span index out of range.")
     E1003 = ("Unsupported lemmatizer mode '{mode}'.")
     E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
-             "Required tables '{tables}', found '{found}'. If you are not "
-             "providing custom lookups, make sure you have the package "
-             "spacy-lookups-data installed.")
+             "Required tables: {tables}. Found: {found}. Maybe you forgot to "
+             "call nlp.initialize() to load in the data?")
     E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
              "'{chunk}'. Tokenizer exceptions are only allowed to specify "
              "`ORTH` and `NORM`.")

@@ -4,7 +4,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -24,18 +23,11 @@ class Bengali(Language):
 @Bengali.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Bengali"]

@@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .lemmatizer import GreekLemmatizer
-from ...lookups import Lookups
 from ...language import Language
 
 
@@ -29,18 +28,11 @@ class Greek(Language):
 @Greek.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return GreekLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Greek"]

@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES
 from .lemmatizer import EnglishLemmatizer
 from ...language import Language
-from ...lookups import Lookups
 
 
 class EnglishDefaults(Language.Defaults):
@@ -27,18 +26,11 @@ class English(Language):
 @English.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
    default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["English"]

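Every language-specific lemmatizer factory in this diff (Bengali through Ukrainian) drops the "lookups" argument; tables are now loaded when the pipeline is initialized. A short, hedged sketch of the new flow for English, mirroring the updated tests later in this diff (assumes spacy-lookups-data is installed so default tables can be fetched):

# Hedged sketch of the new initialization flow: the factory no longer takes
# "lookups"; nlp.initialize() loads the tables instead.
import spacy

nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
assert not lemmatizer.lookups.tables  # nothing loaded yet
nlp.initialize()                      # pulls tables from spacy-lookups-data
print(nlp("coping")[0].lemma_)
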
@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -27,18 +26,11 @@ class Persian(Language):
 @Persian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Persian"]

@@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .lemmatizer import FrenchLemmatizer
-from ...lookups import Lookups
 from ...language import Language
 
 
@@ -32,18 +31,11 @@ class French(Language):
 @French.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["French"]

@@ -6,7 +6,6 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -27,18 +26,11 @@ class Norwegian(Language):
 @Norwegian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Norwegian"]

@@ -1,5 +1,4 @@
 from typing import Optional
 
 from thinc.api import Model
-
 from .stop_words import STOP_WORDS
@@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
-from ...lookups import Lookups
 from ...language import Language
 
 
@@ -29,18 +27,11 @@ class Dutch(Language):
 @Dutch.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return DutchLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Dutch"]

@@ -34,18 +34,11 @@ class Polish(Language):
 @Polish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "lookups": None},
+    default_config={"model": None, "mode": "pos_lookup"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return PolishLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Polish"]

@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
-from ...lookups import Lookups
 
 
 class RussianDefaults(Language.Defaults):
@@ -23,17 +22,11 @@ class Russian(Language):
 @Russian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Russian"]

@@ -5,7 +5,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -30,18 +29,11 @@ class Swedish(Language):
 @Swedish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Swedish"]

@@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import UkrainianLemmatizer
 from ...language import Language
-from ...lookups import Lookups
 
 
 class UkrainianDefaults(Language.Defaults):
@@ -24,17 +23,11 @@ class Ukrainian(Language):
 @Ukrainian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Ukrainian"]

@@ -1,7 +1,8 @@
 from typing import List, Union, Callable, Tuple
-from thinc.types import Ints2d, Doc
+from thinc.types import Ints2d
 from thinc.api import Model, registry
 
+from ..tokens import Doc
 
 
 @registry.layers("spacy.FeatureExtractor.v1")
@@ -9,7 +10,9 @@ def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
     return Model("extract_features", forward, attrs={"columns": columns})
 
 
-def forward(model: Model[List[Doc], List[Ints2d]], docs, is_train: bool) -> Tuple[List[Ints2d], Callable]:
+def forward(
+    model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
+) -> Tuple[List[Ints2d], Callable]:
     columns = model.attrs["columns"]
     features: List[Ints2d] = []
     for doc in docs:

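The change here is that Doc is now imported from spacy's own tokens module instead of thinc.types, and the forward signature is wrapped. A hedged usage sketch of the registered layer (the module path and the chosen attributes are assumptions drawn from the relative imports above, not asserted by the diff):

# Hedged sketch: extract per-token attribute IDs with the FeatureExtractor layer.
import spacy
from spacy.ml.featureextractor import FeatureExtractor  # path assumed from the relative imports

nlp = spacy.blank("en")
docs = [nlp.make_doc("green eggs and ham")]
model = FeatureExtractor(["ORTH", "SHAPE"])  # columns may be attribute names or IDs
arrays = model.predict(docs)                 # one Ints2d array per Doc
print(arrays[0].shape)                       # (4, 2): tokens x columns
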
@@ -1,4 +1,4 @@
-from typing import Optional, List
+from typing import Optional, List, Union
 from thinc.types import Floats2d
 from thinc.api import chain, clone, concatenate, with_array, with_padded
 from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
@@ -10,7 +10,7 @@ from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
+from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
 
 
 @registry.architectures.register("spacy.Tok2VecListener.v1")
@@ -98,7 +98,7 @@ def MultiHashEmbed(
     attributes using hash embedding, concatenates the results, and passes it
     through a feed-forward subnetwork to build a mixed representations.
 
-    The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have
+    The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
     varying definitions depending on the Vocab of the Doc object passed in.
     Vectors from pretrained static vectors can also be incorporated into the
     concatenated representation.
@@ -115,7 +115,7 @@ def MultiHashEmbed(
     also_use_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
-    cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
+    cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
     seed = 7
 
     def make_hash_embed(feature):
@@ -123,7 +123,7 @@ def MultiHashEmbed(
         seed += 1
         return HashEmbed(
             width,
-            rows if feature == NORM else rows // 2,
+            rows if feature == LOWER else rows // 2,
             column=cols.index(feature),
             seed=seed,
             dropout=0.0,
@@ -131,13 +131,13 @@ def MultiHashEmbed(
 
     if also_embed_subwords:
         embeddings = [
-            make_hash_embed(NORM),
+            make_hash_embed(LOWER),
             make_hash_embed(PREFIX),
             make_hash_embed(SUFFIX),
             make_hash_embed(SHAPE),
         ]
     else:
-        embeddings = [make_hash_embed(NORM)]
+        embeddings = [make_hash_embed(LOWER)]
     concat_size = width * (len(embeddings) + also_use_static_vectors)
     if also_use_static_vectors:
         model = chain(
@@ -165,7 +165,8 @@ def MultiHashEmbed(
 
 @registry.architectures.register("spacy.CharacterEmbed.v1")
 def CharacterEmbed(
-    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
+    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool,
+    feature: Union[int, str]="LOWER"
 ):
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
@@ -179,12 +180,13 @@ def CharacterEmbed(
     of being in an arbitrary position depending on the word length.
 
     The characters are embedded in a embedding table with a given number of rows,
-    and the vectors concatenated. A hash-embedded vector of the NORM of the word is
+    and the vectors concatenated. A hash-embedded vector of the LOWER of the word is
     also concatenated on, and the result is then passed through a feed-forward
     network to construct a single vector to represent the information.
 
-    width (int): The width of the output vector and the NORM hash embedding.
-    rows (int): The number of rows in the NORM hash embedding table.
+    feature (int or str): An attribute to embed, to concatenate with the characters.
+    width (int): The width of the output vector and the feature embedding.
+    rows (int): The number of rows in the LOWER hash embedding table.
     nM (int): The dimensionality of the character embeddings. Recommended values
         are between 16 and 64.
     nC (int): The number of UTF-8 bytes to embed per word. Recommended values
@@ -193,12 +195,15 @@ def CharacterEmbed(
     also_use_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
+    feature = intify_attr(feature)
+    if feature is None:
+        raise ValueError("Invalid feature: Must be a token attribute.")
     if also_use_static_vectors:
         model = chain(
             concatenate(
                 chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
                 chain(
-                    FeatureExtractor([NORM]),
+                    FeatureExtractor([feature]),
                     list2ragged(),
                     with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
                 ),
@@ -214,7 +219,7 @@ def CharacterEmbed(
             concatenate(
                 chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
                 chain(
-                    FeatureExtractor([NORM]),
+                    FeatureExtractor([feature]),
                     list2ragged(),
                     with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
                 ),

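CharacterEmbed now takes a configurable feature (default "LOWER") for the word-level hash embedding, and MultiHashEmbed's word column switches from NORM to LOWER. A hedged sketch of building the architecture through the registry with the new argument (the sizes are illustrative only):

# Hedged sketch: resolve the registered architecture and pass the new "feature"
# argument; the width/rows/nM/nC values are illustrative, not from this diff.
from spacy import registry

make_character_embed = registry.architectures.get("spacy.CharacterEmbed.v1")
model = make_character_embed(
    width=128, rows=7000, nM=64, nC=8,
    also_use_static_vectors=False, feature="LOWER",
)
print(model.name)
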
@@ -1,26 +1,25 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
+from typing import Tuple
 from thinc.api import Model
+from pathlib import Path
 
 from .pipe import Pipe
 from ..errors import Errors
 from ..language import Language
+from ..training import Example
 from ..lookups import Lookups, load_lookups
 from ..scorer import Scorer
 from ..tokens import Doc, Token
 from ..vocab import Vocab
 from ..training import validate_examples
+from ..util import logger, SimpleFrozenList
 from .. import util
 
 
 @Language.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={
-        "model": None,
-        "mode": "lookup",
-        "lookups": None,
-        "overwrite": False,
-    },
+    default_config={"model": None, "mode": "lookup", "overwrite": False},
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
@@ -28,13 +27,9 @@ def make_lemmatizer(
     model: Optional[Model],
     name: str,
     mode: str,
-    lookups: Optional[Lookups],
     overwrite: bool = False,
 ):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(
-        nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
-    )
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
 
 
 class Lemmatizer(Pipe):
@@ -46,59 +41,19 @@ class Lemmatizer(Pipe):
     """
 
     @classmethod
-    def get_lookups_config(cls, mode: str) -> Dict:
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
         """Returns the lookups configuration settings for a given mode for use
         in Lemmatizer.load_lookups.
 
         mode (str): The lemmatizer mode.
-        RETURNS (dict): The lookups configuration settings for this mode.
-
-        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
+        RETURNS (Tuple[List[str], List[str]]): The required and optional
+            lookup tables for this mode.
         """
         if mode == "lookup":
-            return {
-                "required_tables": ["lemma_lookup"],
-            }
+            return (["lemma_lookup"], [])
         elif mode == "rule":
-            return {
-                "required_tables": ["lemma_rules"],
-                "optional_tables": ["lemma_exc", "lemma_index"],
-            }
-        return {}
-
-    @classmethod
-    def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
-        """Load and validate lookups tables. If the provided lookups is None,
-        load the default lookups tables according to the language and mode
-        settings. Confirm that all required tables for the language and mode
-        are present.
-
-        lang (str): The language code.
-        mode (str): The lemmatizer mode.
-        lookups (Lookups): The provided lookups, may be None if the default
-            lookups should be loaded.
-        RETURNS (Lookups): The Lookups object.
-
-        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
-        """
-        config = cls.get_lookups_config(mode)
-        required_tables = config.get("required_tables", [])
-        optional_tables = config.get("optional_tables", [])
-        if lookups is None:
-            lookups = load_lookups(lang=lang, tables=required_tables)
-            optional_lookups = load_lookups(
-                lang=lang, tables=optional_tables, strict=False
-            )
-            for table in optional_lookups.tables:
-                lookups.set_table(table, optional_lookups.get_table(table))
-        for table in required_tables:
-            if table not in lookups:
-                raise ValueError(
-                    Errors.E1004.format(
-                        mode=mode, tables=required_tables, found=lookups.tables
-                    )
-                )
-        return lookups
+            return (["lemma_rules"], ["lemma_exc", "lemma_index"])
+        return ([], [])
 
     def __init__(
         self,
@@ -107,7 +62,6 @@ class Lemmatizer(Pipe):
         name: str = "lemmatizer",
         *,
         mode: str = "lookup",
-        lookups: Optional[Lookups] = None,
         overwrite: bool = False,
     ) -> None:
         """Initialize a Lemmatizer.
@@ -116,9 +70,6 @@ class Lemmatizer(Pipe):
         model (Model): A model (not yet implemented).
         name (str): The component name. Defaults to "lemmatizer".
        mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
-        lookups (Lookups): The lookups object containing the (optional) tables
-            such as "lemma_rules", "lemma_index", "lemma_exc" and
-            "lemma_lookup". Defaults to None
         overwrite (bool): Whether to overwrite existing lemmas. Defaults to
             `False`.
 
@@ -128,8 +79,9 @@ class Lemmatizer(Pipe):
         self.model = model
         self.name = name
         self._mode = mode
-        self.lookups = lookups if lookups is not None else Lookups()
+        self.lookups = Lookups()
         self.overwrite = overwrite
+        self._validated = False
         if self.mode == "lookup":
             self.lemmatize = self.lookup_lemmatize
         elif self.mode == "rule":
@@ -153,12 +105,56 @@ class Lemmatizer(Pipe):
 
         DOCS: https://nightly.spacy.io/api/lemmatizer#call
         """
+        if not self._validated:
+            self._validate_tables(Errors.E1004)
         for token in doc:
             if self.overwrite or token.lemma == 0:
                 token.lemma_ = self.lemmatize(token)[0]
         return doc
 
-    def pipe(self, stream, *, batch_size=128):
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        nlp: Optional[Language] = None,
+        lookups: Optional[Lookups] = None,
+    ):
+        """Initialize the lemmatizer and load in data.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        lookups (Lookups): The lookups object containing the (optional) tables
+            such as "lemma_rules", "lemma_index", "lemma_exc" and
+            "lemma_lookup". Defaults to None.
+        """
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        if lookups is None:
+            logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
+            lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+            optional_lookups = load_lookups(
+                lang=self.vocab.lang, tables=optional_tables, strict=False
+            )
+            for table in optional_lookups.tables:
+                lookups.set_table(table, optional_lookups.get_table(table))
+        self.lookups = lookups
+        self._validate_tables(Errors.E1004)
+
+    def _validate_tables(self, error_message: str = Errors.E912) -> None:
+        """Check that the lookups are correct for the current mode."""
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        for table in required_tables:
+            if table not in self.lookups:
+                raise ValueError(
+                    error_message.format(
+                        mode=self.mode,
+                        tables=required_tables,
+                        found=self.lookups.tables,
+                    )
+                )
+        self._validated = True
+
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
@@ -263,7 +259,7 @@ class Lemmatizer(Pipe):
         """
         return False
 
-    def score(self, examples, **kwargs) -> Dict[str, Any]:
+    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples.
 
         examples (Iterable[Example]): The examples to score.
@@ -274,58 +270,66 @@ class Lemmatizer(Pipe):
         validate_examples(examples, "Lemmatizer.score")
         return Scorer.score_token_attr(examples, "lemma", **kwargs)
 
-    def to_disk(self, path, *, exclude=tuple()):
-        """Save the current state to a directory.
+    def to_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ):
+        """Serialize the pipe to disk.
 
-        path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist.
-        exclude (list): String names of serialization fields to exclude.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
 
-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
         """
         serialize = {}
         serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["lookups"] = lambda p: self.lookups.to_disk(p)
         util.to_disk(path, serialize, exclude)
 
-    def from_disk(self, path, *, exclude=tuple()):
-        """Loads state from a directory. Modifies the object in place and
-        returns it.
+    def from_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from disk. Modifies the object in place and returns it.
 
-        path (unicode or Path): A path to a directory.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The modified `Vocab` object.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The modified Lemmatizer object.
 
-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
         """
         deserialize = {}
         deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
         util.from_disk(path, deserialize, exclude)
+        self._validate_tables()
+        return self
 
-    def to_bytes(self, *, exclude=tuple()) -> bytes:
-        """Serialize the current state to a binary string.
+    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
+        """Serialize the pipe to a bytestring.
 
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized form of the `Vocab` object.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (bytes): The serialized object.
 
-        DOCS: https://nightly.spacy.io/api/vocab#to_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
         """
         serialize = {}
         serialize["vocab"] = self.vocab.to_bytes
         serialize["lookups"] = self.lookups.to_bytes
         return util.to_bytes(serialize, exclude)
 
-    def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
-        """Load state from a binary string.
+    def from_bytes(
+        self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from a bytestring.
 
-        bytes_data (bytes): The data to load from.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The `Vocab` object.
+        bytes_data (bytes): The serialized pipe.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The loaded Lemmatizer.
 
-        DOCS: https://nightly.spacy.io/api/vocab#from_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
         """
         deserialize = {}
         deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
         util.from_bytes(bytes_data, deserialize, exclude)
+        self._validate_tables()
+        return self

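Custom tables are now passed to Lemmatizer.initialize (or supplied via the [initialize] block of the config) instead of the removed "lookups" factory argument, and the tables are validated before the first call. A hedged sketch, mirroring the updated tests below:

# Hedged sketch, based on the updated tests: hand-built tables go to initialize().
from spacy.lang.en import English
from spacy.lookups import Lookups

nlp = English()
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})

lookups = Lookups()
lookups.add_table("lemma_lookup", {"coping": "cope"})
lemmatizer.initialize(lookups=lookups)  # raises E1004 if required tables are missing

print(nlp("coping")[0].lemma_)  # -> cope
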
@@ -282,7 +282,7 @@ class ModelMetaSchema(BaseModel):
     sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
     vectors: Dict[str, Any] = Field({}, title="Included word vectors")
     labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
-    performance: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy and speed numbers")
+    performance: Dict[str, Union[float, Dict[str, Union[float, dict]]]] = Field({}, title="Accuracy and speed numbers")
     spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
     # fmt: on
 

@@ -77,7 +77,7 @@ def test_morph_property(tokenizer):
     assert doc.to_array(["MORPH"])[0] != 0
 
     # unset with token.morph
-    doc[0].set_morph(0)
+    doc[0].set_morph(None)
     assert doc.to_array(["MORPH"])[0] == 0
 
     # empty morph is equivalent to "_"

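The test now unsets a token's morphology with set_morph(None) rather than 0. A small hedged sketch of the behaviour being exercised (the FEATS string and the assumption that set_morph also accepts strings are illustrative, not from this diff):

# Hedged sketch: set and then clear a token's morph analysis.
import spacy

nlp = spacy.blank("en")
doc = nlp("test")
doc[0].set_morph("Number=Sing")   # assumes the string form of set_morph
assert doc.to_array(["MORPH"])[0] != 0
doc[0].set_morph(None)            # unset, per the updated test
assert doc.to_array(["MORPH"])[0] == 0
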
@@ -17,16 +17,31 @@ def test_lemmatizer_initialize(lang, capfd):
     @registry.misc("lemmatizer_init_lookups")
     def lemmatizer_init_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "x": "y"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups
 
-    """Test that languages can be initialized."""
+    # Test that languages can be initialized
     nlp = get_lang_class(lang)()
-    nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+    assert not lemmatizer.lookups.tables
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "lemmatizer_init_lookups"}
+    }
+    with pytest.raises(ValueError):
+        nlp("x")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    doc = nlp("x")
     # Check for stray print statements (see #3342)
-    doc = nlp("test")  # noqa: F841
     captured = capfd.readouterr()
     assert not captured.out
+    assert doc[0].lemma_ == "y"
+
+    # Test initialization by calling .initialize() directly
+    nlp = get_lang_class(lang)()
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+    lemmatizer.initialize(lookups=lemmatizer_init_lookups())
+    assert nlp("x")[0].lemma_ == "y"

@@ -8,61 +8,52 @@ from ..util import make_tempdir
 
 @pytest.fixture
 def nlp():
-    return English()
-
-
-@pytest.fixture
-def lemmatizer(nlp):
     @registry.misc("cope_lookups")
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups
 
-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
-    return lemmatizer
+    nlp = English()
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "cope_lookups"}
+    }
+    return nlp
 
 
 def test_lemmatizer_init(nlp):
-    @registry.misc("cope_lookups")
-    def cope_lookups():
-        lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
-        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
-        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
-        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
-        return lookups
-
-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     assert isinstance(lemmatizer.lookups, Lookups)
+    assert not lemmatizer.lookups.tables
     assert lemmatizer.mode == "lookup"
+    with pytest.raises(ValueError):
+        nlp("test")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "cope"
     # replace any tables from spacy-lookups-data
     lemmatizer.lookups = Lookups()
-    doc = nlp("coping")
     # lookup with no tables sets text as lemma
-    assert doc[0].lemma_ == "coping"
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "coped"
     nlp.remove_pipe("lemmatizer")
-
-    @registry.misc("empty_lookups")
-    def empty_lookups():
-        return Lookups()
-
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     with pytest.raises(ValueError):
-        nlp.add_pipe(
-            "lemmatizer",
-            config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
-        )
+        # Can't initialize without required tables
+        lemmatizer.initialize(lookups=Lookups())
+    lookups = Lookups()
+    lookups.add_table("lemma_lookup", {})
+    lemmatizer.initialize(lookups=lookups)
 
 
-def test_lemmatizer_config(nlp, lemmatizer):
+def test_lemmatizer_config(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
+
     doc = nlp.make_doc("coping")
     doc[0].pos_ = "VERB"
     assert doc[0].lemma_ == ""
@@ -78,20 +69,21 @@ def test_lemmatizer_config(nlp, lemmatizer):
     assert doc[0].lemma_ == "cope"
 
 
-def test_lemmatizer_serialize(nlp, lemmatizer):
-    @registry.misc("cope_lookups")
+def test_lemmatizer_serialize(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
+
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups
 
     nlp2 = English()
-    lemmatizer2 = nlp2.add_pipe(
+    lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
|
||||||
"lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
|
lemmatizer2.initialize(lookups=cope_lookups())
|
||||||
)
|
|
||||||
lemmatizer2.from_bytes(lemmatizer.to_bytes())
|
lemmatizer2.from_bytes(lemmatizer.to_bytes())
|
||||||
assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
|
assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
|
||||||
assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
|
assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
|
||||||
|
@ -100,9 +92,9 @@ def test_lemmatizer_serialize(nlp, lemmatizer):
|
||||||
with make_tempdir() as tmp_dir:
|
with make_tempdir() as tmp_dir:
|
||||||
nlp.to_disk(tmp_dir)
|
nlp.to_disk(tmp_dir)
|
||||||
nlp2 = util.load_model_from_path(tmp_dir)
|
nlp2 = util.load_model_from_path(tmp_dir)
|
||||||
doc2 = nlp2.make_doc("coping")
|
doc2 = nlp2.make_doc("coping")
|
||||||
doc2[0].pos_ = "VERB"
|
doc2[0].pos_ = "VERB"
|
||||||
assert doc2[0].lemma_ == ""
|
assert doc2[0].lemma_ == ""
|
||||||
doc2 = lemmatizer(doc2)
|
doc2 = lemmatizer(doc2)
|
||||||
assert doc2[0].text == "coping"
|
assert doc2[0].text == "coping"
|
||||||
assert doc2[0].lemma_ == "cope"
|
assert doc2[0].lemma_ == "cope"
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.tokens.doc import Underscore
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.tokens import Doc, DocBin
|
from spacy.tokens import Doc, DocBin
|
||||||
|
@ -86,3 +89,20 @@ def test_serialize_doc_bin_unknown_spaces(en_vocab):
|
||||||
assert re_doc1.text == "that 's "
|
assert re_doc1.text == "that 's "
|
||||||
assert not re_doc2.has_unknown_spaces
|
assert not re_doc2.has_unknown_spaces
|
||||||
assert re_doc2.text == "that's"
|
assert re_doc2.text == "that's"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"writer_flag,reader_flag,reader_value", [(True, True, "bar"), (True, False, "bar"), (False, True, "nothing"), (False, False, "nothing")]
|
||||||
|
)
|
||||||
|
def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value):
|
||||||
|
"""Test that custom extensions are correctly serialized in DocBin."""
|
||||||
|
Doc.set_extension("foo", default="nothing")
|
||||||
|
doc = Doc(en_vocab, words=["hello", "world"])
|
||||||
|
doc._.foo = "bar"
|
||||||
|
doc_bin_1 = DocBin(store_user_data=writer_flag)
|
||||||
|
doc_bin_1.add(doc)
|
||||||
|
doc_bin_bytes = doc_bin_1.to_bytes()
|
||||||
|
doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes)
|
||||||
|
doc_2 = list(doc_bin_2.get_docs(en_vocab))[0]
|
||||||
|
assert doc_2._.foo == reader_value
|
||||||
|
Underscore.doc_extensions = {}
|
||||||
|
|
|
@ -7,11 +7,11 @@ from spacy.training.converters import json_to_docs
|
||||||
from spacy.training.augment import create_orth_variants_augmenter
|
from spacy.training.augment import create_orth_variants_augmenter
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.tokens import Doc, DocBin
|
from spacy.tokens import Doc, DocBin
|
||||||
from spacy.lookups import Lookups
|
|
||||||
from spacy.util import get_words_and_spaces, minibatch
|
from spacy.util import get_words_and_spaces, minibatch
|
||||||
from thinc.api import compounding
|
from thinc.api import compounding
|
||||||
import pytest
|
import pytest
|
||||||
import srsly
|
import srsly
|
||||||
|
import random
|
||||||
|
|
||||||
from ..util import make_tempdir
|
from ..util import make_tempdir
|
||||||
|
|
||||||
|
@ -504,9 +504,9 @@ def test_make_orth_variants(doc):
|
||||||
{"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
|
{"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
lookups = Lookups()
|
augmenter = create_orth_variants_augmenter(
|
||||||
lookups.add_table("orth_variants", orth_variants)
|
level=0.2, lower=0.5, orth_variants=orth_variants
|
||||||
augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)
|
)
|
||||||
with make_tempdir() as tmpdir:
|
with make_tempdir() as tmpdir:
|
||||||
output_file = tmpdir / "roundtrip.spacy"
|
output_file = tmpdir / "roundtrip.spacy"
|
||||||
DocBin(docs=[doc]).to_disk(output_file)
|
DocBin(docs=[doc]).to_disk(output_file)
|
||||||
|
@ -515,6 +515,39 @@ def test_make_orth_variants(doc):
|
||||||
list(reader(nlp))
|
list(reader(nlp))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_custom_data_augmentation(doc):
|
||||||
|
def create_spongebob_augmenter(randomize: bool = False):
|
||||||
|
def augment(nlp, example):
|
||||||
|
text = example.text
|
||||||
|
if randomize:
|
||||||
|
ch = [c.lower() if random.random() < 0.5 else c.upper() for c in text]
|
||||||
|
else:
|
||||||
|
ch = [c.lower() if i % 2 else c.upper() for i, c in enumerate(text)]
|
||||||
|
example_dict = example.to_dict()
|
||||||
|
doc = nlp.make_doc("".join(ch))
|
||||||
|
example_dict["token_annotation"]["ORTH"] = [t.text for t in doc]
|
||||||
|
yield example
|
||||||
|
yield example.from_dict(doc, example_dict)
|
||||||
|
|
||||||
|
return augment
|
||||||
|
|
||||||
|
nlp = English()
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
output_file = tmpdir / "roundtrip.spacy"
|
||||||
|
DocBin(docs=[doc]).to_disk(output_file)
|
||||||
|
reader = Corpus(output_file, augmenter=create_spongebob_augmenter())
|
||||||
|
corpus = list(reader(nlp))
|
||||||
|
orig_text = "Sarah 's sister flew to Silicon Valley via London . "
|
||||||
|
augmented = "SaRaH 's sIsTeR FlEw tO SiLiCoN VaLlEy vIa lOnDoN . "
|
||||||
|
assert corpus[0].text == orig_text
|
||||||
|
assert corpus[0].reference.text == orig_text
|
||||||
|
assert corpus[0].predicted.text == orig_text
|
||||||
|
assert corpus[1].text == augmented
|
||||||
|
assert corpus[1].reference.text == augmented
|
||||||
|
assert corpus[1].predicted.text == augmented
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip("Outdated")
|
@pytest.mark.skip("Outdated")
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"tokens_a,tokens_b,expected",
|
"tokens_a,tokens_b,expected",
|
||||||
|
|
|
@ -58,7 +58,7 @@ class DocBin:
|
||||||
|
|
||||||
attrs (Iterable[str]): List of attributes to serialize. 'orth' and
|
attrs (Iterable[str]): List of attributes to serialize. 'orth' and
|
||||||
'spacy' are always serialized, so they're not required.
|
'spacy' are always serialized, so they're not required.
|
||||||
store_user_data (bool): Whether to include the `Doc.user_data`.
|
store_user_data (bool): Whether to write the `Doc.user_data` to bytes/file.
|
||||||
docs (Iterable[Doc]): Docs to add.
|
docs (Iterable[Doc]): Docs to add.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/docbin#init
|
DOCS: https://nightly.spacy.io/api/docbin#init
|
||||||
|
@ -106,11 +106,12 @@ class DocBin:
|
||||||
self.strings.add(token.ent_type_)
|
self.strings.add(token.ent_type_)
|
||||||
self.strings.add(token.ent_kb_id_)
|
self.strings.add(token.ent_kb_id_)
|
||||||
self.cats.append(doc.cats)
|
self.cats.append(doc.cats)
|
||||||
if self.store_user_data:
|
self.user_data.append(srsly.msgpack_dumps(doc.user_data))
|
||||||
self.user_data.append(srsly.msgpack_dumps(doc.user_data))
|
|
||||||
|
|
||||||
def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
|
def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
|
||||||
"""Recover Doc objects from the annotations, using the given vocab.
|
"""Recover Doc objects from the annotations, using the given vocab.
|
||||||
|
Note that the user data of each doc will be read (if available) and returned,
|
||||||
|
regardless of the setting of 'self.store_user_data'.
|
||||||
|
|
||||||
vocab (Vocab): The shared vocab.
|
vocab (Vocab): The shared vocab.
|
||||||
YIELDS (Doc): The Doc objects.
|
YIELDS (Doc): The Doc objects.
|
||||||
|
@ -129,7 +130,7 @@ class DocBin:
|
||||||
doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
|
doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
|
||||||
doc = doc.from_array(self.attrs, tokens)
|
doc = doc.from_array(self.attrs, tokens)
|
||||||
doc.cats = self.cats[i]
|
doc.cats = self.cats[i]
|
||||||
if self.store_user_data:
|
if i < len(self.user_data) and self.user_data[i] is not None:
|
||||||
user_data = srsly.msgpack_loads(self.user_data[i], use_list=False)
|
user_data = srsly.msgpack_loads(self.user_data[i], use_list=False)
|
||||||
doc.user_data.update(user_data)
|
doc.user_data.update(user_data)
|
||||||
yield doc
|
yield doc
|
||||||
|
@ -137,21 +138,31 @@ class DocBin:
|
||||||
def merge(self, other: "DocBin") -> None:
|
def merge(self, other: "DocBin") -> None:
|
||||||
"""Extend the annotations of this DocBin with the annotations from
|
"""Extend the annotations of this DocBin with the annotations from
|
||||||
another. Will raise an error if the pre-defined attrs of the two
|
another. Will raise an error if the pre-defined attrs of the two
|
||||||
DocBins don't match.
|
DocBins don't match, or if they differ in whether or not to store
|
||||||
|
user data.
|
||||||
|
|
||||||
other (DocBin): The DocBin to merge into the current bin.
|
other (DocBin): The DocBin to merge into the current bin.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/docbin#merge
|
DOCS: https://nightly.spacy.io/api/docbin#merge
|
||||||
"""
|
"""
|
||||||
if self.attrs != other.attrs:
|
if self.attrs != other.attrs:
|
||||||
raise ValueError(Errors.E166.format(current=self.attrs, other=other.attrs))
|
raise ValueError(
|
||||||
|
Errors.E166.format(param="attrs", current=self.attrs, other=other.attrs)
|
||||||
|
)
|
||||||
|
if self.store_user_data != other.store_user_data:
|
||||||
|
raise ValueError(
|
||||||
|
Errors.E166.format(
|
||||||
|
param="store_user_data",
|
||||||
|
current=self.store_user_data,
|
||||||
|
other=other.store_user_data,
|
||||||
|
)
|
||||||
|
)
|
||||||
self.tokens.extend(other.tokens)
|
self.tokens.extend(other.tokens)
|
||||||
self.spaces.extend(other.spaces)
|
self.spaces.extend(other.spaces)
|
||||||
self.strings.update(other.strings)
|
self.strings.update(other.strings)
|
||||||
self.cats.extend(other.cats)
|
self.cats.extend(other.cats)
|
||||||
self.flags.extend(other.flags)
|
self.flags.extend(other.flags)
|
||||||
if self.store_user_data:
|
self.user_data.extend(other.user_data)
|
||||||
self.user_data.extend(other.user_data)
|
|
||||||
|
|
||||||
def to_bytes(self) -> bytes:
|
def to_bytes(self) -> bytes:
|
||||||
"""Serialize the DocBin's annotations to a bytestring.
|
"""Serialize the DocBin's annotations to a bytestring.
|
||||||
|
@ -200,8 +211,10 @@ class DocBin:
|
||||||
self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
|
self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
|
||||||
self.cats = msg["cats"]
|
self.cats = msg["cats"]
|
||||||
self.flags = msg.get("flags", [{} for _ in lengths])
|
self.flags = msg.get("flags", [{} for _ in lengths])
|
||||||
if self.store_user_data and "user_data" in msg:
|
if "user_data" in msg:
|
||||||
self.user_data = list(msg["user_data"])
|
self.user_data = list(msg["user_data"])
|
||||||
|
else:
|
||||||
|
self.user_data = [None] * len(self)
|
||||||
for tokens in self.tokens:
|
for tokens in self.tokens:
|
||||||
assert len(tokens.shape) == 2, tokens.shape # this should never happen
|
assert len(tokens.shape) == 2, tokens.shape # this should never happen
|
||||||
return self
|
return self
|
||||||
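A short sketch of the stricter `merge` behavior introduced above: two bins that disagree on `store_user_data` now raise `E166` instead of silently mixing docs with and without user data (the doc texts are placeholders):

```python
from spacy.lang.en import English
from spacy.tokens import DocBin

nlp = English()
db_a = DocBin(store_user_data=True, docs=[nlp.make_doc("first doc")])
db_b = DocBin(store_user_data=False, docs=[nlp.make_doc("second doc")])
try:
    db_a.merge(db_b)
except ValueError as err:
    print(err)  # E166: mismatched store_user_data
```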
|
|
|
@ -223,8 +223,10 @@ cdef class Token:
|
||||||
|
|
||||||
def set_morph(self, features):
|
def set_morph(self, features):
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
if features is 0:
|
if features is None:
|
||||||
self.c.morph = 0
|
self.c.morph = 0
|
||||||
|
elif isinstance(features, MorphAnalysis):
|
||||||
|
self.morph = features
|
||||||
else:
|
else:
|
||||||
if isinstance(features, int):
|
if isinstance(features, int):
|
||||||
features = self.vocab.strings[features]
|
features = self.vocab.strings[features]
|
||||||
|
|
|
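The `set_morph` change above widens the accepted inputs; a small sketch of resetting with `None` (newly supported) alongside setting from a features dict:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Give it back")
doc[0].set_morph({"Mood": "Imp", "VerbForm": "Fin"})  # features dict
assert "Mood=Imp" in str(doc[0].morph)
doc[0].set_morph(None)  # None now resets the morph to an unset state
assert str(doc[0].morph) == ""
```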
@ -1,27 +1,43 @@
|
||||||
from typing import Callable, Iterator, Dict, List, Tuple, Optional, TYPE_CHECKING
|
from typing import Callable, Iterator, Dict, List, Tuple, TYPE_CHECKING
|
||||||
import random
|
import random
|
||||||
import itertools
|
import itertools
|
||||||
import copy
|
import copy
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
from pydantic import BaseModel, StrictStr
|
||||||
|
|
||||||
from ..util import registry, logger
|
from ..util import registry, logger
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from .example import Example
|
from .example import Example
|
||||||
from ..lookups import Lookups
|
|
||||||
from ..errors import Errors
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from ..language import Language # noqa: F401
|
from ..language import Language # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
|
class OrthVariantsSingle(BaseModel):
|
||||||
|
tags: List[StrictStr]
|
||||||
|
variants: List[StrictStr]
|
||||||
|
|
||||||
|
|
||||||
|
class OrthVariantsPaired(BaseModel):
|
||||||
|
tags: List[StrictStr]
|
||||||
|
variants: List[List[StrictStr]]
|
||||||
|
|
||||||
|
|
||||||
|
class OrthVariants(BaseModel):
|
||||||
|
paired: List[OrthVariantsPaired] = []
|
||||||
|
single: List[OrthVariantsSingle] = []
|
||||||
|
|
||||||
|
|
||||||
@registry.augmenters("spacy.orth_variants.v1")
|
@registry.augmenters("spacy.orth_variants.v1")
|
||||||
def create_orth_variants_augmenter(
|
def create_orth_variants_augmenter(
|
||||||
level: float, lower: float, lookups: Optional[Lookups] = None,
|
level: float, lower: float, orth_variants: OrthVariants,
|
||||||
) -> Callable[["Language", Example], Iterator[Example]]:
|
) -> Callable[["Language", Example], Iterator[Example]]:
|
||||||
"""Create a data augmentation callback that uses orth-variant replacement.
|
"""Create a data augmentation callback that uses orth-variant replacement.
|
||||||
The callback can be added to a corpus or other data iterator during training.
|
The callback can be added to a corpus or other data iterator during training.
|
||||||
"""
|
"""
|
||||||
return partial(orth_variants_augmenter, level=level, lower=lower, lookups=lookups)
|
return partial(
|
||||||
|
orth_variants_augmenter, orth_variants=orth_variants, level=level, lower=lower
|
||||||
|
)
|
||||||
|
|
||||||
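Since the augmenter factory now takes the variant data directly instead of going through `Lookups`, a hedged usage sketch (the variant entry is copied from the test above; the corpus path is a placeholder):

```python
from spacy.training import Corpus
from spacy.training.augment import create_orth_variants_augmenter

orth_variants = {
    "single": [{"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}],
    "paired": [],
}
augmenter = create_orth_variants_augmenter(
    level=0.2, lower=0.5, orth_variants=orth_variants
)
# plug the callback into a corpus reader during training
corpus = Corpus("./corpus/train.spacy", augmenter=augmenter)
```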
|
|
||||||
def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
|
def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
|
||||||
|
@ -31,20 +47,11 @@ def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
|
||||||
def orth_variants_augmenter(
|
def orth_variants_augmenter(
|
||||||
nlp: "Language",
|
nlp: "Language",
|
||||||
example: Example,
|
example: Example,
|
||||||
|
orth_variants: dict,
|
||||||
*,
|
*,
|
||||||
level: float = 0.0,
|
level: float = 0.0,
|
||||||
lower: float = 0.0,
|
lower: float = 0.0,
|
||||||
lookups: Optional[Lookups] = None,
|
|
||||||
) -> Iterator[Example]:
|
) -> Iterator[Example]:
|
||||||
table_name = "orth_variants"
|
|
||||||
if lookups is not None:
|
|
||||||
orth_variants = lookups.get_table(table_name, {})
|
|
||||||
logger.debug("Using data augmentation orth variants from provided lookups")
|
|
||||||
else:
|
|
||||||
orth_variants = nlp.vocab.lookups.get_table(table_name, {})
|
|
||||||
logger.debug("Using data augmentation orth variants from default vocab lookups")
|
|
||||||
if not orth_variants:
|
|
||||||
raise ValueError(Errors.E912.format(lang=nlp.lang))
|
|
||||||
if random.random() >= level:
|
if random.random() >= level:
|
||||||
yield example
|
yield example
|
||||||
else:
|
else:
|
||||||
|
@ -74,13 +81,14 @@ def make_orth_variants(
|
||||||
nlp: "Language",
|
nlp: "Language",
|
||||||
raw: str,
|
raw: str,
|
||||||
token_dict: Dict[str, List[str]],
|
token_dict: Dict[str, List[str]],
|
||||||
orth_variants: Dict[str, list],
|
orth_variants: Dict[str, List[Dict[str, List[str]]]],
|
||||||
*,
|
*,
|
||||||
lower: bool = False,
|
lower: bool = False,
|
||||||
) -> Tuple[str, Dict[str, List[str]]]:
|
) -> Tuple[str, Dict[str, List[str]]]:
|
||||||
orig_token_dict = copy.deepcopy(token_dict)
|
orig_token_dict = copy.deepcopy(token_dict)
|
||||||
ndsv = orth_variants.get("single", [])
|
ndsv = orth_variants.get("single", [])
|
||||||
ndpv = orth_variants.get("paired", [])
|
ndpv = orth_variants.get("paired", [])
|
||||||
|
logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
|
||||||
words = token_dict.get("words", [])
|
words = token_dict.get("words", [])
|
||||||
tags = token_dict.get("tags", [])
|
tags = token_dict.get("tags", [])
|
||||||
# keep unmodified if words or tags are not defined
|
# keep unmodified if words or tags are not defined
|
||||||
|
|
|
@ -38,11 +38,11 @@ def create_docbin_reader(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@util.registry.readers("spacy.JsonlReader.v1")
|
@util.registry.readers("spacy.JsonlCorpus.v1")
|
||||||
def create_jsonl_reader(
|
def create_jsonl_reader(
|
||||||
path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0
|
path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0
|
||||||
) -> Callable[["Language"], Iterable[Doc]]:
|
) -> Callable[["Language"], Iterable[Doc]]:
|
||||||
return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
|
return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit)
|
||||||
|
|
||||||
|
|
||||||
@util.registry.readers("spacy.read_labels.v1")
|
@util.registry.readers("spacy.read_labels.v1")
|
||||||
|
@ -193,7 +193,7 @@ class Corpus:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
class JsonlTexts:
|
class JsonlCorpus:
|
||||||
"""Iterate Doc objects from a file or directory of jsonl
|
"""Iterate Doc objects from a file or directory of jsonl
|
||||||
formatted raw text files.
|
formatted raw text files.
|
||||||
|
|
||||||
|
@ -206,7 +206,7 @@ class JsonlTexts:
|
||||||
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
|
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
|
||||||
Defaults to 0, which indicates no limit.
|
Defaults to 0, which indicates no limit.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/corpus#jsonltexts
|
DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus
|
||||||
"""
|
"""
|
||||||
|
|
||||||
file_type = "jsonl"
|
file_type = "jsonl"
|
||||||
|
@ -230,7 +230,7 @@ class JsonlTexts:
|
||||||
nlp (Language): The current nlp object.
|
nlp (Language): The current nlp object.
|
||||||
YIELDS (Example): The example objects.
|
YIELDS (Example): The example objects.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call
|
DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus-call
|
||||||
"""
|
"""
|
||||||
for loc in walk_corpus(self.path, ".jsonl"):
|
for loc in walk_corpus(self.path, ".jsonl"):
|
||||||
records = srsly.read_jsonl(loc)
|
records = srsly.read_jsonl(loc)
|
||||||
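The reader class is renamed here from `JsonlTexts` to `JsonlCorpus`, and the registered function from `spacy.JsonlReader.v1` to `spacy.JsonlCorpus.v1`. A short sketch of the new name in user code, with a placeholder path:

```python
import spacy
from spacy.training import JsonlCorpus

corpus = JsonlCorpus("./texts.jsonl", min_length=5, max_length=500)
nlp = spacy.blank("en")
examples = corpus(nlp)  # lazily yields Example objects built from the raw texts
```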
|
|
|
@ -103,10 +103,6 @@ class registry(thinc.registry):
|
||||||
cli = catalogue.create("spacy", "cli", entry_points=True)
|
cli = catalogue.create("spacy", "cli", entry_points=True)
|
||||||
|
|
||||||
|
|
||||||
# We want json loading in the registry, so manually register srsly.read_json.
|
|
||||||
registry.readers("srsly.read_json.v0", srsly.read_json)
|
|
||||||
|
|
||||||
|
|
||||||
class SimpleFrozenDict(dict):
|
class SimpleFrozenDict(dict):
|
||||||
"""Simplified implementation of a frozen dict, mainly used as default
|
"""Simplified implementation of a frozen dict, mainly used as default
|
||||||
function or method argument (for arguments that should default to empty
|
function or method argument (for arguments that should default to empty
|
||||||
|
|
|
@ -100,7 +100,7 @@ Yield examples from the data.
|
||||||
| `nlp` | The current `nlp` object. ~~Language~~ |
|
| `nlp` | The current `nlp` object. ~~Language~~ |
|
||||||
| **YIELDS** | The examples. ~~Example~~ |
|
| **YIELDS** | The examples. ~~Example~~ |
|
||||||
|
|
||||||
## JsonlTexts {#jsonltexts tag="class"}
|
## JsonlCorpus {#jsonlcorpus tag="class"}
|
||||||
|
|
||||||
Iterate Doc objects from a file or directory of JSONL (newline-delimited JSON)
|
Iterate Doc objects from a file or directory of JSONL (newline-delimited JSON)
|
||||||
formatted raw text files. Can be used to read the raw text corpus for language
|
formatted raw text files. Can be used to read the raw text corpus for language
|
||||||
|
@ -126,22 +126,22 @@ file.
|
||||||
{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
|
{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
|
||||||
```
|
```
|
||||||
|
|
||||||
### JsonlTexts.\_\_init\_\_ {#jsonltexts-init tag="method"}
|
### JsonlCorpus.\_\_init\_\_ {#jsonlcorpus tag="method"}
|
||||||
|
|
||||||
Initialize the reader.
|
Initialize the reader.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.training import JsonlTexts
|
> from spacy.training import JsonlCorpus
|
||||||
>
|
>
|
||||||
> corpus = JsonlTexts("./data/texts.jsonl")
|
> corpus = JsonlCorpus("./data/texts.jsonl")
|
||||||
> ```
|
> ```
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
> ### Example config
|
> ### Example config
|
||||||
> [corpora.pretrain]
|
> [corpora.pretrain]
|
||||||
> @readers = "spacy.JsonlReader.v1"
|
> @readers = "spacy.JsonlCorpus.v1"
|
||||||
> path = "corpus/raw_text.jsonl"
|
> path = "corpus/raw_text.jsonl"
|
||||||
> min_length = 0
|
> min_length = 0
|
||||||
> max_length = 0
|
> max_length = 0
|
||||||
|
@ -156,17 +156,17 @@ Initialize the reader.
|
||||||
| `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
|
| `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
|
||||||
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
||||||
|
|
||||||
### JsonlTexts.\_\_call\_\_ {#jsonltexts-call tag="method"}
|
### JsonlCorpus.\_\_call\_\_ {#jsonlcorpus-call tag="method"}
|
||||||
|
|
||||||
Yield examples from the data.
|
Yield examples from the data.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.training import JsonlTexts
|
> from spacy.training import JsonlCorpus
|
||||||
> import spacy
|
> import spacy
|
||||||
>
|
>
|
||||||
> corpus = JsonlTexts("./texts.jsonl")
|
> corpus = JsonlCorpus("./texts.jsonl")
|
||||||
> nlp = spacy.blank("en")
|
> nlp = spacy.blank("en")
|
||||||
> data = corpus(nlp)
|
> data = corpus(nlp)
|
||||||
> ```
|
> ```
|
||||||
|
|
|
@ -135,7 +135,7 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy
|
||||||
> path = ${paths.dev}
|
> path = ${paths.dev}
|
||||||
>
|
>
|
||||||
> [corpora.pretrain]
|
> [corpora.pretrain]
|
||||||
> @readers = "spacy.JsonlReader.v1"
|
> @readers = "spacy.JsonlCorpus.v1"
|
||||||
> path = ${paths.raw}
|
> path = ${paths.raw}
|
||||||
>
|
>
|
||||||
> [corpora.my_custom_data]
|
> [corpora.my_custom_data]
|
||||||
|
@ -146,7 +146,7 @@ This section defines a **dictionary** mapping of string keys to functions. Each
|
||||||
function takes an `nlp` object and yields [`Example`](/api/example) objects. By
|
function takes an `nlp` object and yields [`Example`](/api/example) objects. By
|
||||||
default, the two keys `train` and `dev` are specified and each refer to a
|
default, the two keys `train` and `dev` are specified and each refer to a
|
||||||
[`Corpus`](/api/top-level#Corpus). When pretraining, an additional `pretrain`
|
[`Corpus`](/api/top-level#Corpus). When pretraining, an additional `pretrain`
|
||||||
section is added that defaults to a [`JsonlReader`](/api/top-level#JsonlReader).
|
section is added that defaults to a [`JsonlCorpus`](/api/top-level#JsonlCorpus).
|
||||||
You can also register custom functions that return a callable.
|
You can also register custom functions that return a callable.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
|
|
@ -47,7 +47,7 @@ Create a `DocBin` object to hold serialized annotations.
|
||||||
| Argument | Description |
|
| Argument | Description |
|
||||||
| ----------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `attrs` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. ~~Iterable[str]~~ |
|
| `attrs` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. ~~Iterable[str]~~ |
|
||||||
| `store_user_data` | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. ~~bool~~ |
|
| `store_user_data` | Whether to write the `Doc.user_data` and the values of custom extension attributes to file/bytes. Defaults to `False`. ~~bool~~ |
|
||||||
| `docs` | `Doc` objects to add on initialization. ~~Iterable[Doc]~~ |
|
| `docs` | `Doc` objects to add on initialization. ~~Iterable[Doc]~~ |
|
||||||
|
|
||||||
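A hedged round-trip sketch of the flag as now documented: the writing side decides whether user data (including custom extension values) ends up in the serialized bytes, and whatever was written is restored on read. The `source` extension is an assumed example, not a built-in:

```python
import spacy
from spacy.tokens import Doc, DocBin

nlp = spacy.blank("en")
Doc.set_extension("source", default=None)  # assumed custom extension
doc = nlp("hello world")
doc._.source = "example.jsonl"

data = DocBin(store_user_data=True, docs=[doc]).to_bytes()
restored = list(DocBin().from_bytes(data).get_docs(nlp.vocab))[0]
assert restored._.source == "example.jsonl"
```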
## DocBin.\_\_len\_\_ {#len tag="method"}
|
## DocBin.\_\_len\_\_ {#len tag="method"}
|
||||||
|
|
|
@ -8,8 +8,8 @@ source: spacy/language.py
|
||||||
Usually you'll load this once per process as `nlp` and pass the instance around
|
Usually you'll load this once per process as `nlp` and pass the instance around
|
||||||
your application. The `Language` class is created when you call
|
your application. The `Language` class is created when you call
|
||||||
[`spacy.load`](/api/top-level#spacy.load) and contains the shared vocabulary and
|
[`spacy.load`](/api/top-level#spacy.load) and contains the shared vocabulary and
|
||||||
[language data](/usage/adding-languages), optional binary weights, e.g. provided
|
[language data](/usage/linguistic-features#language-data), optional binary
|
||||||
by a [trained pipeline](/models), and the
|
weights, e.g. provided by a [trained pipeline](/models), and the
|
||||||
[processing pipeline](/usage/processing-pipelines) containing components like
|
[processing pipeline](/usage/processing-pipelines) containing components like
|
||||||
the tagger or parser that are called on a document in order. You can also add
|
the tagger or parser that are called on a document in order. You can also add
|
||||||
your own processing pipeline components that take a `Doc` object, modify it and
|
your own processing pipeline components that take a `Doc` object, modify it and
|
||||||
|
@ -210,7 +210,9 @@ settings defined in the [`[initialize]`](/api/data-formats#config-initialize)
|
||||||
config block to set up the vocabulary, load in vectors and tok2vec weights and
|
config block to set up the vocabulary, load in vectors and tok2vec weights and
|
||||||
pass optional arguments to the `initialize` methods implemented by pipeline
|
pass optional arguments to the `initialize` methods implemented by pipeline
|
||||||
components or the tokenizer. This method is typically called automatically when
|
components or the tokenizer. This method is typically called automatically when
|
||||||
you run [`spacy train`](/api/cli#train).
|
you run [`spacy train`](/api/cli#train). See the usage guide on the
|
||||||
|
[config lifecycle](/usage/training#config-lifecycle) and
|
||||||
|
[initialization](/usage/training#initialization) for details.
|
||||||
|
|
||||||
`get_examples` should be a function that returns an iterable of
|
`get_examples` should be a function that returns an iterable of
|
||||||
[`Example`](/api/example) objects. The data examples can either be the full
|
[`Example`](/api/example) objects. The data examples can either be the full
|
||||||
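A minimal sketch of the flow described above when calling the method directly rather than via `spacy train` (toy data; the tag values are assumed):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("tagger")

def get_examples():
    doc = nlp.make_doc("I like cats")
    return [Example.from_dict(doc, {"tags": ["PRON", "VERB", "NOUN"]})]

# reads the [initialize] block and lets the tagger infer its labels from the examples
nlp.initialize(get_examples)
```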
|
@ -928,7 +930,7 @@ Serialize the current state to a binary string.
|
||||||
|
|
||||||
Load state from a binary string. Note that this method is commonly used via the
|
Load state from a binary string. Note that this method is commonly used via the
|
||||||
subclasses like `English` or `German` to make language-specific functionality
|
subclasses like `English` or `German` to make language-specific functionality
|
||||||
like the [lexical attribute getters](/usage/adding-languages#lex-attrs)
|
like the [lexical attribute getters](/usage/linguistic-features#language-data)
|
||||||
available to the loaded object.
|
available to the loaded object.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
|
|
@ -48,12 +48,11 @@ data format used by the lookup and rule-based lemmatizers, see
|
||||||
> nlp.add_pipe("lemmatizer", config=config)
|
> nlp.add_pipe("lemmatizer", config=config)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Setting | Description |
|
| Setting | Description |
|
||||||
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------- | --------------------------------------------------------------------------------- |
|
||||||
| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
|
| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
|
||||||
| `lookups` | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
|
| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
|
||||||
| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
|
| `model` | **Not yet implemented:** the model to use. ~~Model~~ |
|
||||||
| `model` | **Not yet implemented:** the model to use. ~~Model~~ |
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
|
%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
|
||||||
|
@ -76,15 +75,14 @@ Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | --------------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||||
| `model` | **Not yet implemented:** The model to use. ~~Model~~ |
|
| `model` | **Not yet implemented:** The model to use. ~~Model~~ |
|
||||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
|
| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
|
||||||
| `lookups` | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. ~~Optional[Lookups]~~ |
|
| `overwrite` | Whether to overwrite existing lemmas. ~~bool~~ |
|
||||||
| `overwrite` | Whether to overwrite existing lemmas. ~~bool~~ |
|
|
||||||
|
|
||||||
## Lemmatizer.\_\_call\_\_ {#call tag="method"}
|
## Lemmatizer.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
@ -127,11 +125,41 @@ applied to the `Doc` in order.
|
||||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||||
|
|
||||||
|
## Lemmatizer.initialize {#initialize tag="method"}
|
||||||
|
|
||||||
|
Initialize the lemmatizer and load any data resources. This method is typically
|
||||||
|
called by [`Language.initialize`](/api/language#initialize) and lets you
|
||||||
|
customize arguments it receives via the
|
||||||
|
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||||
|
config. The loading only happens during initialization, typically before
|
||||||
|
training. At runtime, all data is loaded from disk.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> lemmatizer = nlp.add_pipe("lemmatizer")
|
||||||
|
> lemmatizer.initialize(lookups=lookups)
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> ### config.cfg
|
||||||
|
> [initialize.components.lemmatizer]
|
||||||
|
>
|
||||||
|
> [initialize.components.lemmatizer.lookups]
|
||||||
|
> @misc = "load_my_lookups.v1"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Defaults to `None`. ~~Optional[Callable[[], Iterable[Example]]]~~ |
|
||||||
|
| _keyword-only_ | |
|
||||||
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
|
| `lookups` | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
|
||||||
|
|
||||||
## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
|
## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
|
||||||
|
|
||||||
Lemmatize a token using a lookup-based approach. If no lemma is found, the
|
Lemmatize a token using a lookup-based approach. If no lemma is found, the
|
||||||
original string is returned. Languages can provide a
|
original string is returned.
|
||||||
[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.
|
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | --------------------------------------------------- |
|
| ----------- | --------------------------------------------------- |
|
||||||
|
|
|
@ -172,6 +172,25 @@ Get a neighboring token.
|
||||||
| `i` | The relative position of the token to get. Defaults to `1`. ~~int~~ |
|
| `i` | The relative position of the token to get. Defaults to `1`. ~~int~~ |
|
||||||
| **RETURNS** | The token at position `self.doc[self.i+i]`. ~~Token~~ |
|
| **RETURNS** | The token at position `self.doc[self.i+i]`. ~~Token~~ |
|
||||||
|
|
||||||
|
## Token.set_morph {#set_morph tag="method"}
|
||||||
|
|
||||||
|
Set the morphological analysis from a UD FEATS string, hash value of a UD FEATS
|
||||||
|
string, features dict or `MorphAnalysis`. The value `None` can be used to reset
|
||||||
|
the morph to an unset state.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("Give it back! He pleaded.")
|
||||||
|
> doc[0].set_morph("Mood=Imp|VerbForm=Fin")
|
||||||
|
> assert "Mood=Imp" in doc[0].morph
|
||||||
|
> assert doc[0].morph.get("Mood") == ["Imp"]
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| -------- | --------------------------------------------------------------------------------- |
|
||||||
|
| features | The morphological features to set. ~~Union[int, dict, str, MorphAnalysis, None]~~ |
|
||||||
|
|
||||||
## Token.is_ancestor {#is_ancestor tag="method" model="parser"}
|
## Token.is_ancestor {#is_ancestor tag="method" model="parser"}
|
||||||
|
|
||||||
Check whether this token is a parent, grandparent, etc. of another in the
|
Check whether this token is a parent, grandparent, etc. of another in the
|
||||||
|
@ -392,74 +411,73 @@ The L2 norm of the token's vector representation.
|
||||||
|
|
||||||
## Attributes {#attributes}
|
## Attributes {#attributes}
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `doc` | The parent document. ~~Doc~~ |
|
| `doc` | The parent document. ~~Doc~~ |
|
||||||
| `lex` <Tag variant="new">3</Tag> | The underlying lexeme. ~~Lexeme~~ |
|
| `lex` <Tag variant="new">3</Tag> | The underlying lexeme. ~~Lexeme~~ |
|
||||||
| `sent` <Tag variant="new">2.0.12</Tag> | The sentence span that this token is a part of. ~~Span~~ |
|
| `sent` <Tag variant="new">2.0.12</Tag> | The sentence span that this token is a part of. ~~Span~~ |
|
||||||
| `text` | Verbatim text content. ~~str~~ |
|
| `text` | Verbatim text content. ~~str~~ |
|
||||||
| `text_with_ws` | Text content, with trailing space character if present. ~~str~~ |
|
| `text_with_ws` | Text content, with trailing space character if present. ~~str~~ |
|
||||||
| `whitespace_` | Trailing space character if present. ~~str~~ |
|
| `whitespace_` | Trailing space character if present. ~~str~~ |
|
||||||
| `orth` | ID of the verbatim text content. ~~int~~ |
|
| `orth` | ID of the verbatim text content. ~~int~~ |
|
||||||
| `orth_` | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~ |
|
| `orth_` | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~ |
|
||||||
| `vocab` | The vocab object of the parent `Doc`. ~~vocab~~ |
|
| `vocab` | The vocab object of the parent `Doc`. ~~vocab~~ |
|
||||||
| `tensor` <Tag variant="new">2.1.7</Tag> | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ |
|
| `tensor` <Tag variant="new">2.1.7</Tag> | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ |
|
||||||
| `head` | The syntactic parent, or "governor", of this token. ~~Token~~ |
|
| `head` | The syntactic parent, or "governor", of this token. ~~Token~~ |
|
||||||
| `left_edge` | The leftmost token of this token's syntactic descendants. ~~Token~~ |
|
| `left_edge` | The leftmost token of this token's syntactic descendants. ~~Token~~ |
|
||||||
| `right_edge` | The rightmost token of this token's syntactic descendants. ~~Token~~ |
|
| `right_edge` | The rightmost token of this token's syntactic descendants. ~~Token~~ |
|
||||||
| `i` | The index of the token within the parent document. ~~int~~ |
|
| `i` | The index of the token within the parent document. ~~int~~ |
|
||||||
| `ent_type` | Named entity type. ~~int~~ |
|
| `ent_type` | Named entity type. ~~int~~ |
|
||||||
| `ent_type_` | Named entity type. ~~str~~ |
|
| `ent_type_` | Named entity type. ~~str~~ |
|
||||||
| `ent_iob` | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~ |
|
| `ent_iob` | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~ |
|
||||||
| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ |
|
| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ |
|
||||||
| `ent_kb_id` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ |
|
| `ent_kb_id` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ |
|
||||||
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ |
|
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ |
|
||||||
| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ |
|
| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ |
|
||||||
| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ |
|
| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ |
|
||||||
| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ |
|
| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ |
|
||||||
| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ |
|
| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ |
|
||||||
| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~int~~ |
|
| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ |
|
||||||
| `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~str~~ |
|
| `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~ |
|
||||||
| `lower` | Lowercase form of the token. ~~int~~ |
|
| `lower` | Lowercase form of the token. ~~int~~ |
|
||||||
| `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ |
|
| `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ |
|
||||||
| `shape` | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~int~~ |
|
| `shape` | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~int~~ |
|
||||||
| `shape_` | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~str~~ |
|
| `shape_` | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~str~~ |
|
||||||
| `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ |
|
| `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ |
|
||||||
| `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ |
|
| `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ |
|
||||||
| `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ |
|
| `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ |
|
||||||
| `suffix_` | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~ |
|
| `suffix_` | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~ |
|
||||||
| `is_alpha` | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~ |
|
| `is_alpha` | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~ |
|
||||||
| `is_ascii` | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~ |
|
| `is_ascii` | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~ |
|
||||||
| `is_digit` | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~ |
|
| `is_digit` | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~ |
|
||||||
| `is_lower` | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~ |
|
| `is_lower` | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~ |
|
||||||
| `is_upper` | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~ |
|
| `is_upper` | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~ |
|
||||||
| `is_title` | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~ |
|
| `is_title` | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~ |
|
||||||
| `is_punct` | Is the token punctuation? ~~bool~~ |
|
| `is_punct` | Is the token punctuation? ~~bool~~ |
|
||||||
| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ |
|
| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ |
|
||||||
| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ |
|
| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ |
|
||||||
| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
|
| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
|
||||||
| `is_bracket` | Is the token a bracket? ~~bool~~ |
|
| `is_bracket` | Is the token a bracket? ~~bool~~ |
|
||||||
| `is_quote` | Is the token a quotation mark? ~~bool~~ |
|
| `is_quote` | Is the token a quotation mark? ~~bool~~ |
|
||||||
| `is_currency` <Tag variant="new">2.0.8</Tag> | Is the token a currency symbol? ~~bool~~ |
|
| `is_currency` <Tag variant="new">2.0.8</Tag> | Is the token a currency symbol? ~~bool~~ |
|
||||||
| `like_url` | Does the token resemble a URL? ~~bool~~ |
|
| `like_url` | Does the token resemble a URL? ~~bool~~ |
|
||||||
| `like_num` | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ |
|
| `like_num` | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ |
|
||||||
| `like_email` | Does the token resemble an email address? ~~bool~~ |
|
| `like_email` | Does the token resemble an email address? ~~bool~~ |
|
||||||
| `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ |
|
| `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ |
|
||||||
| `is_stop` | Is the token part of a "stop list"? ~~bool~~ |
|
| `is_stop` | Is the token part of a "stop list"? ~~bool~~ |
|
||||||
| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ |
|
| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ |
|
||||||
| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ |
|
| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ |
|
||||||
| `tag` | Fine-grained part-of-speech. ~~int~~ |
|
| `tag` | Fine-grained part-of-speech. ~~int~~ |
|
||||||
| `tag_` | Fine-grained part-of-speech. ~~str~~ |
|
| `tag_` | Fine-grained part-of-speech. ~~str~~ |
|
||||||
| `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~ |
|
| `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~ |
|
||||||
| `morph_` <Tag variant="new">3</Tag> | Morphological analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
|
| `dep` | Syntactic dependency relation. ~~int~~ |
|
||||||
| `dep` | Syntactic dependency relation. ~~int~~ |
|
| `dep_` | Syntactic dependency relation. ~~str~~ |
|
||||||
| `dep_` | Syntactic dependency relation. ~~str~~ |
|
| `lang` | Language of the parent document's vocabulary. ~~int~~ |
|
||||||
| `lang` | Language of the parent document's vocabulary. ~~int~~ |
|
| `lang_` | Language of the parent document's vocabulary. ~~str~~ |
|
||||||
| `lang_` | Language of the parent document's vocabulary. ~~str~~ |
|
| `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ |
|
||||||
| `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ |
|
| `idx` | The character offset of the token within the parent document. ~~int~~ |
|
||||||
| `idx` | The character offset of the token within the parent document. ~~int~~ |
|
| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ |
|
||||||
| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ |
|
| `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
|
||||||
| `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
|
| `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
|
||||||
| `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
|
| `cluster` | Brown cluster ID. ~~int~~ |
|
||||||
| `cluster` | Brown cluster ID. ~~int~~ |
|
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
|
||||||
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
|
|
||||||
|
|
|
@ -22,9 +22,8 @@ like punctuation and special case rules from the
|
||||||
|
|
||||||
## Tokenizer.\_\_init\_\_ {#init tag="method"}
|
## Tokenizer.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
Create a `Tokenizer` to create `Doc` objects given unicode text. For examples
|
Create a `Tokenizer` to create `Doc` objects given unicode text. For examples of
|
||||||
of how to construct a custom tokenizer with different tokenization rules, see
|
how to construct a custom tokenizer with different tokenization rules, see the
|
||||||
the
|
|
||||||
[usage documentation](https://spacy.io/usage/linguistic-features#native-tokenizers).
|
[usage documentation](https://spacy.io/usage/linguistic-features#native-tokenizers).
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
@ -87,7 +86,7 @@ Tokenize a stream of texts.
|
||||||
| ------------ | ------------------------------------------------------------------------------------ |
|
| ------------ | ------------------------------------------------------------------------------------ |
|
||||||
| `texts` | A sequence of unicode texts. ~~Iterable[str]~~ |
|
| `texts` | A sequence of unicode texts. ~~Iterable[str]~~ |
|
||||||
| `batch_size` | The number of texts to accumulate in an internal buffer. Defaults to `1000`. ~~int~~ |
|
| `batch_size` | The number of texts to accumulate in an internal buffer. Defaults to `1000`. ~~int~~ |
|
||||||
| **YIELDS** | The tokenized `Doc` objects, in order. ~~Doc~~ |
|
| **YIELDS** | The tokenized `Doc` objects, in order. ~~Doc~~ |
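As a quick, hedged sketch of how this streams texts (the example texts and batch size are made up for illustration):

```python
import spacy

nlp = spacy.blank("en")
texts = ["First text to tokenize.", "Second text to tokenize."]
# Only the tokenizer runs here – no other pipeline components are applied
for doc in nlp.tokenizer.pipe(texts, batch_size=50):
    print([token.text for token in doc])
```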
|
||||||
|
|
||||||
## Tokenizer.find_infix {#find_infix tag="method"}
|
## Tokenizer.find_infix {#find_infix tag="method"}
|
||||||
|
|
||||||
|
@ -121,10 +120,10 @@ if no suffix rules match.
|
||||||
## Tokenizer.add_special_case {#add_special_case tag="method"}
|
## Tokenizer.add_special_case {#add_special_case tag="method"}
|
||||||
|
|
||||||
Add a special-case tokenization rule. This mechanism is also used to add custom
|
Add a special-case tokenization rule. This mechanism is also used to add custom
|
||||||
tokenizer exceptions to the language data. See the usage guide on
|
tokenizer exceptions to the language data. See the usage guide on the
|
||||||
[adding languages](/usage/adding-languages#tokenizer-exceptions) and
|
[language data](/usage/linguistic-features#language-data) and
|
||||||
[linguistic features](/usage/linguistic-features#special-cases) for more details
|
[tokenizer special cases](/usage/linguistic-features#special-cases) for more
|
||||||
and examples.
|
details and examples.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
|
|
@ -269,11 +269,11 @@ If a setting is not present in the options, the default value will be used.
|
||||||
> displacy.serve(doc, style="ent", options=options)
|
> displacy.serve(doc, style="ent", options=options)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ |
|
| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ |
|
||||||
| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ |
|
| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ |
|
||||||
| `template` <Tag variant="new">2.2</Tag> | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ |
|
| `template` <Tag variant="new">2.2</Tag> | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ |
|
||||||
|
|
||||||
By default, displaCy comes with colors for all entity types used by
|
By default, displaCy comes with colors for all entity types used by
|
||||||
[spaCy's trained pipelines](/models). If you're using custom entity types, you
|
[spaCy's trained pipelines](/models). If you're using custom entity types, you
|
||||||
|
@ -327,7 +327,7 @@ factories.
|
||||||
| `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
|
| `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
|
||||||
| `misc` | Registry for miscellaneous functions that return data assets, knowledge bases or anything else you may need. |
|
| `misc` | Registry for miscellaneous functions that return data assets, knowledge bases or anything else you may need. |
|
||||||
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
|
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
|
||||||
| `readers` | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). |
|
| `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). |
|
||||||
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
|
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
|
||||||
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
|
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
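As a brief illustration of how these registries are used in practice – the function name `"my_patterns.v1"` and its return value are invented for this sketch – you can register a function under `misc` and refer to it from a config block via `@misc = "my_patterns.v1"`:

```python
import spacy

@spacy.registry.misc("my_patterns.v1")
def create_patterns():
    # Return any data asset you need, e.g. patterns for a rule-based component
    return [{"label": "ORG", "pattern": "spaCy"}]

# Registered functions can also be looked up and called by name
patterns = spacy.registry.misc.get("my_patterns.v1")()
```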
|
||||||
|
|
||||||
|
@ -470,7 +470,65 @@ logging the results.
|
||||||
|
|
||||||
</Project>
|
</Project>
|
||||||
|
|
||||||
## Readers {#readers source="spacy/training/corpus.py" new="3"}
|
## Readers {#readers}
|
||||||
|
|
||||||
|
### File readers {#file-readers source="github.com/explosion/srsly" new="3"}
|
||||||
|
|
||||||
|
The following file readers are provided by our serialization library
|
||||||
|
[`srsly`](https://github.com/explosion/srsly). All registered functions take one
|
||||||
|
argument `path`, pointing to the file path to load.
|
||||||
|
|
||||||
|
> #### Example config
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [corpora.train.augmenter.orth_variants]
|
||||||
|
> @readers = "srsly.read_json.v1"
|
||||||
|
> path = "corpus/en_orth_variants.json"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------------------- | ----------------------------------------------------- |
|
||||||
|
| `srsly.read_json.v1` | Read data from a JSON file. |
|
||||||
|
| `srsly.read_jsonl.v1` | Read data from a JSONL (newline-delimited JSON) file. |
|
||||||
|
| `srsly.read_yaml.v1` | Read data from a YAML file. |
|
||||||
|
| `srsly.read_msgpack.v1` | Read data from a binary MessagePack file. |
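These registered readers roughly correspond to calling the matching `srsly` helpers directly, as in the following sketch (the file paths are placeholders):

```python
import srsly

data = srsly.read_json("corpus/en_orth_variants.json")   # dict or list parsed from JSON
lines = list(srsly.read_jsonl("corpus/raw_text.jsonl"))   # one object per line
settings = srsly.read_yaml("configs/settings.yml")        # parsed YAML
table = srsly.read_msgpack("data/lookups.msgpack")        # binary MessagePack data
```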
|
||||||
|
|
||||||
|
<Infobox title="Important note" variant="warning">
|
||||||
|
|
||||||
|
Since the file readers expect a local path, you should only use them in config
|
||||||
|
blocks that are **not executed at runtime** – for example, in `[training]` and
|
||||||
|
`[corpora]` (to load data or resources like data augmentation tables) or in
|
||||||
|
`[initialize]` (to pass data to pipeline components).
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
#### spacy.read_labels.v1 {#read_labels tag="registered function"}
|
||||||
|
|
||||||
|
Read a JSON-formatted labels file generated with
|
||||||
|
[`init labels`](/api/cli#init-labels). Typically used in the
|
||||||
|
[`[initialize]`](/api/data-formats#config-initialize) block of the training
|
||||||
|
config to speed up the model initialization process and provide pre-generated
|
||||||
|
label sets.
|
||||||
|
|
||||||
|
> #### Example config
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [initialize.components]
|
||||||
|
>
|
||||||
|
> [initialize.components.ner]
|
||||||
|
>
|
||||||
|
> [initialize.components.ner.labels]
|
||||||
|
> @readers = "spacy.read_labels.v1"
|
||||||
|
> path = "corpus/labels/ner.json"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `path` | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ |
|
||||||
|
| `require` | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
|
||||||
|
| **CREATES** | The labels loaded from the file, or `None` if the file doesn't exist and `require` is set to `False`. ~~Optional[Any]~~ |
|
||||||
|
|
||||||
|
### Corpus readers {#corpus-readers source="spacy/training/corpus.py" new="3"}
|
||||||
|
|
||||||
Corpus readers are registered functions that load data and return a function
|
Corpus readers are registered functions that load data and return a function
|
||||||
that takes the current `nlp` object and yields [`Example`](/api/example) objects
|
that takes the current `nlp` object and yields [`Example`](/api/example) objects
|
||||||
|
@ -480,7 +538,7 @@ with your own registered function in the
|
||||||
[`@readers` registry](/api/top-level#registry) to customize the data loading and
|
[`@readers` registry](/api/top-level#registry) to customize the data loading and
|
||||||
streaming.
|
streaming.
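For example, a hypothetical custom reader could look like the following sketch – the registered name, file format and path handling are all invented for illustration:

```python
from pathlib import Path
import spacy
from spacy.training import Example

@spacy.registry.readers("plain_text_reader.v1")
def create_plain_text_reader(path: Path):
    def reader(nlp):
        # Yield one unannotated Example per non-empty line in the file
        for line in Path(path).open(encoding="utf8"):
            text = line.strip()
            if text:
                doc = nlp.make_doc(text)
                yield Example.from_dict(doc, {})
    return reader
```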
|
||||||
|
|
||||||
### spacy.Corpus.v1 {#corpus tag="registered function"}
|
#### spacy.Corpus.v1 {#corpus tag="registered function"}
|
||||||
|
|
||||||
The `Corpus` reader manages annotated corpora and can be used for training and
|
The `Corpus` reader manages annotated corpora and can be used for training and
|
||||||
development datasets in the [DocBin](/api/docbin) (`.spacy`) format. Also see
|
development datasets in the [DocBin](/api/docbin) (`.spacy`) format. Also see
|
||||||
|
@ -509,12 +567,12 @@ the [`Corpus`](/api/corpus) class.
|
||||||
| `augmenter` | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
|
| `augmenter` | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
|
||||||
| **CREATES** | The corpus reader. ~~Corpus~~ |
|
| **CREATES** | The corpus reader. ~~Corpus~~ |
|
||||||
|
|
||||||
### spacy.JsonlReader.v1 {#jsonlreader tag="registered function"}
|
#### spacy.JsonlCorpus.v1 {#jsonlcorpus tag="registered function"}
|
||||||
|
|
||||||
Create [`Example`](/api/example) objects from a JSONL (newline-delimited JSON)
|
Create [`Example`](/api/example) objects from a JSONL (newline-delimited JSON)
|
||||||
file of texts keyed by `"text"`. Can be used to read the raw text corpus for
|
file of texts keyed by `"text"`. Can be used to read the raw text corpus for
|
||||||
language model [pretraining](/usage/embeddings-transformers#pretraining) from a
|
language model [pretraining](/usage/embeddings-transformers#pretraining) from a
|
||||||
JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
|
JSONL file. Also see the [`JsonlCorpus`](/api/corpus#jsonlcorpus) class.
|
||||||
|
|
||||||
> #### Example config
|
> #### Example config
|
||||||
>
|
>
|
||||||
|
@ -523,7 +581,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
|
||||||
> pretrain = "corpus/raw_text.jsonl"
|
> pretrain = "corpus/raw_text.jsonl"
|
||||||
>
|
>
|
||||||
> [corpora.pretrain]
|
> [corpora.pretrain]
|
||||||
> @readers = "spacy.JsonlReader.v1"
|
> @readers = "spacy.JsonlCorpus.v1"
|
||||||
> path = ${paths.pretrain}
|
> path = ${paths.pretrain}
|
||||||
> min_length = 0
|
> min_length = 0
|
||||||
> max_length = 0
|
> max_length = 0
|
||||||
|
@ -536,33 +594,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
|
||||||
| `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
|
| `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
|
||||||
| `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
|
| `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
|
||||||
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
||||||
| **CREATES** | The corpus reader. ~~JsonlTexts~~ |
|
| **CREATES** | The corpus reader. ~~JsonlCorpus~~ |
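As a rough usage sketch, the underlying class can also be used directly – the import path follows the `spacy/training/corpus.py` source noted above and the file path is a placeholder:

```python
import spacy
from spacy.training.corpus import JsonlCorpus

nlp = spacy.blank("en")
corpus = JsonlCorpus("corpus/raw_text.jsonl", min_length=5, max_length=500)
# Calling the corpus reader with the nlp object yields unannotated Example objects
examples = list(corpus(nlp))
```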
|
||||||
|
|
||||||
### spacy.read_labels.v1 {#read_labels tag="registered function"}
|
|
||||||
|
|
||||||
Read a JSON-formatted labels file generated with
|
|
||||||
[`init labels`](/api/cli#init-labels). Typically used in the
|
|
||||||
[`[initialize]`](/api/data-formats#config-initialize) block of the training
|
|
||||||
config to speed up the model initialization process and provide pre-generated
|
|
||||||
label sets.
|
|
||||||
|
|
||||||
> #### Example config
|
|
||||||
>
|
|
||||||
> ```ini
|
|
||||||
> [initialize.components]
|
|
||||||
>
|
|
||||||
> [initialize.components.ner]
|
|
||||||
>
|
|
||||||
> [initialize.components.ner.labels]
|
|
||||||
> @readers = "spacy.read_labels.v1"
|
|
||||||
> path = "corpus/labels/ner.json"
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
||||||
| `path` | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ |
|
|
||||||
| `require` | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
|
|
||||||
| **CREATES** | The |
|
|
||||||
|
|
||||||
## Batchers {#batchers source="spacy/training/batchers.py" new="3"}
|
## Batchers {#batchers source="spacy/training/batchers.py" new="3"}
|
||||||
|
|
||||||
|
@ -653,7 +685,11 @@ sequences in the batch.
|
||||||
|
|
||||||
## Augmenters {#augmenters source="spacy/training/augment.py" new="3"}
|
## Augmenters {#augmenters source="spacy/training/augment.py" new="3"}
|
||||||
|
|
||||||
<!-- TODO: intro, explain data augmentation concept -->
|
Data augmentation is the process of applying small modifications to the training
|
||||||
|
data. It can be especially useful for punctuation and case replacement – for
|
||||||
|
example, if your corpus only uses smart quotes and you want to include
|
||||||
|
variations using regular quotes, or to make the model less sensitive to
|
||||||
|
capitalization by including a mix of capitalized and lowercase examples. See the [usage guide](/usage/training#data-augmentation) for details and examples.
|
||||||
|
|
||||||
### spacy.orth_variants.v1 {#orth_variants tag="registered function"}
|
### spacy.orth_variants.v1 {#orth_variants tag="registered function"}
|
||||||
|
|
||||||
|
@ -664,7 +700,10 @@ sequences in the batch.
|
||||||
> @augmenters = "spacy.orth_variants.v1"
|
> @augmenters = "spacy.orth_variants.v1"
|
||||||
> level = 0.1
|
> level = 0.1
|
||||||
> lower = 0.5
|
> lower = 0.5
|
||||||
> lookups = null
|
>
|
||||||
|
> [corpora.train.augmenter.orth_variants]
|
||||||
|
> @readers = "srsly.read_json.v1"
|
||||||
|
> path = "corpus/en_orth_variants.json"
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Create a data augmentation callback that uses orth-variant replacement. The
|
Create a data augmentation callback that uses orth-variant replacement. The
|
||||||
|
@ -672,12 +711,12 @@ callback can be added to a corpus or other data iterator during training. This
|
||||||
is especially useful for punctuation and case replacement, to help generalize
|
is especially useful for punctuation and case replacement, to help generalize
|
||||||
beyond corpora that don't have smart quotes, or only have smart quotes etc.
|
beyond corpora that don't have smart quotes, or only have smart quotes etc.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `level` | The percentage of texts that will be augmented. ~~float~~ |
|
| `level` | The percentage of texts that will be augmented. ~~float~~ |
|
||||||
| `lower` | The percentage of texts that will be lowercased. ~~float~~ |
|
| `lower` | The percentage of texts that will be lowercased. ~~float~~ |
|
||||||
| `lookups` | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ |
|
| `orth_variants` | A dictionary containing the single and paired orth variants. Typically loaded from a JSON file. See [`en_orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. ~~Dict[str, List[Dict[str, List[Union[str, List[str]]]]]]~~ |
|
||||||
| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
|
| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
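A rough sketch of creating the same augmenter programmatically – the variants dictionary is truncated and the corpus path is a placeholder:

```python
import spacy
from spacy.training import Corpus

orth_variants = {
    "single": [{"tags": ["NFP"], "variants": ["…", "..."]}],
    "paired": [{"tags": ["``", "''"], "variants": [["'", "'"], ["‘", "’"]]}],
}
# Look up the registered augmenter and create the callback with its arguments
augmenter = spacy.registry.augmenters.get("spacy.orth_variants.v1")(
    level=0.1, lower=0.5, orth_variants=orth_variants
)
corpus = Corpus("./corpus/train.spacy", augmenter=augmenter)
```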
|
||||||
|
|
||||||
## Training data and alignment {#gold source="spacy/training"}
|
## Training data and alignment {#gold source="spacy/training"}
|
||||||
|
|
||||||
|
@ -788,7 +827,7 @@ utilities.
|
||||||
### util.get_lang_class {#util.get_lang_class tag="function"}
|
### util.get_lang_class {#util.get_lang_class tag="function"}
|
||||||
|
|
||||||
Import and load a `Language` class. Allows lazy-loading
|
Import and load a `Language` class. Allows lazy-loading
|
||||||
[language data](/usage/adding-languages) and importing languages using the
|
[language data](/usage/linguistic-features#language-data) and importing languages using the
|
||||||
two-letter language code. To add a language code for a custom language class,
|
two-letter language code. To add a language code for a custom language class,
|
||||||
you can register it using the [`@registry.languages`](/api/top-level#registry)
|
you can register it using the [`@registry.languages`](/api/top-level#registry)
|
||||||
decorator.
|
decorator.
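For instance, a hedged sketch of registering a hypothetical custom language code and lazy-loading it afterwards:

```python
import spacy
from spacy.lang.en import English
from spacy.util import get_lang_class

@spacy.registry.languages("custom_en")
class CustomEnglish(English):
    lang = "custom_en"

lang_cls = get_lang_class("custom_en")  # resolves the registered class by its code
nlp = lang_cls()
```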
|
||||||
|
|
|
@ -622,7 +622,7 @@ that are familiar from the training block: the `[pretraining.batcher]`,
|
||||||
`[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and
|
`[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and
|
||||||
expect the same types of objects, although for pretraining your corpus does not
|
expect the same types of objects, although for pretraining your corpus does not
|
||||||
need to have any annotations, so you will often use a different reader, such as
|
need to have any annotations, so you will often use a different reader, such as
|
||||||
the [`JsonlReader`](/api/top-level#jsonlreader).
|
the [`JsonlCorpus`](/api/top-level#jsonlcorpus).
|
||||||
|
|
||||||
> #### Raw text format
|
> #### Raw text format
|
||||||
>
|
>
|
||||||
|
|
|
@ -8,10 +8,7 @@ menu:
|
||||||
- ['Changelog', 'changelog']
|
- ['Changelog', 'changelog']
|
||||||
---
|
---
|
||||||
|
|
||||||
spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**,
|
## Quickstart {hidden="true"}
|
||||||
**macOS/OS X** and **Windows**. The latest spaCy releases are available over
|
|
||||||
[pip](https://pypi.python.org/pypi/spacy) and
|
|
||||||
[conda](https://anaconda.org/conda-forge/spacy).
|
|
||||||
|
|
||||||
> #### 📖 Looking for the old docs?
|
> #### 📖 Looking for the old docs?
|
||||||
>
|
>
|
||||||
|
@ -19,21 +16,22 @@ spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**,
|
||||||
> website to [**v2.spacy.io**](https://v2.spacy.io/docs). To see what's changed
|
> website to [**v2.spacy.io**](https://v2.spacy.io/docs). To see what's changed
|
||||||
> and how to migrate, see the guide on [v3.0 guide](/usage/v3).
|
> and how to migrate, see the guide on [v3.0 guide](/usage/v3).
|
||||||
|
|
||||||
## Quickstart {hidden="true"}
|
|
||||||
|
|
||||||
import QuickstartInstall from 'widgets/quickstart-install.js'
|
import QuickstartInstall from 'widgets/quickstart-install.js'
|
||||||
|
|
||||||
<QuickstartInstall title="Quickstart" id="quickstart" />
|
<QuickstartInstall id="quickstart" />
|
||||||
|
|
||||||
## Installation instructions {#installation}
|
## Installation instructions {#installation}
|
||||||
|
|
||||||
|
spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**,
|
||||||
|
**macOS/OS X** and **Windows**. The latest spaCy releases are available over
|
||||||
|
[pip](https://pypi.python.org/pypi/spacy) and
|
||||||
|
[conda](https://anaconda.org/conda-forge/spacy).
|
||||||
|
|
||||||
### pip {#pip}
|
### pip {#pip}
|
||||||
|
|
||||||
Using pip, spaCy releases are available as source packages and binary wheels.
|
Using pip, spaCy releases are available as source packages and binary wheels.
|
||||||
|
Before you install spaCy and its dependencies, make sure that your `pip`,
|
||||||
```bash
|
`setuptools` and `wheel` are up to date.
|
||||||
$ pip install -U spacy
|
|
||||||
```
|
|
||||||
|
|
||||||
> #### Download pipelines
|
> #### Download pipelines
|
||||||
>
|
>
|
||||||
|
@ -47,16 +45,10 @@ $ pip install -U spacy
|
||||||
> >>> nlp = spacy.load("en_core_web_sm")
|
> >>> nlp = spacy.load("en_core_web_sm")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
<Infobox variant="warning">
|
```bash
|
||||||
|
$ pip install -U pip setuptools wheel
|
||||||
To install additional data tables for lemmatization you can run
|
$ pip install -U spacy
|
||||||
`pip install spacy[lookups]` or install
|
```
|
||||||
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
|
|
||||||
separately. The lookups package is needed to provide normalization and
|
|
||||||
lemmatization data for new models and to lemmatize in languages that don't yet
|
|
||||||
come with trained pipelines and aren't powered by third-party libraries.
|
|
||||||
|
|
||||||
</Infobox>
|
|
||||||
|
|
||||||
When using pip it is generally recommended to install packages in a virtual
|
When using pip it is generally recommended to install packages in a virtual
|
||||||
environment to avoid modifying system state:
|
environment to avoid modifying system state:
|
||||||
|
@ -64,9 +56,28 @@ environment to avoid modifying system state:
|
||||||
```bash
|
```bash
|
||||||
$ python -m venv .env
|
$ python -m venv .env
|
||||||
$ source .env/bin/activate
|
$ source .env/bin/activate
|
||||||
|
$ pip install -U pip setuptools wheel
|
||||||
$ pip install spacy
|
$ pip install spacy
|
||||||
```
|
```
|
||||||
|
|
||||||
|
spaCy also lets you install extra dependencies by specifying the following
|
||||||
|
keywords in brackets, e.g. `spacy[ja]` or `spacy[lookups,transformers]` (with
|
||||||
|
multiple comma-separated extras). See the `[options.extras_require]` section in
|
||||||
|
spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> $ pip install spacy[lookups,transformers]
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `lookups` | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
|
||||||
|
| `transformers` | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline. |
|
||||||
|
| `cuda`, ... | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options. |
|
||||||
|
| `ja`, `ko`, `th` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages). |
|
||||||
|
|
||||||
### conda {#conda}
|
### conda {#conda}
|
||||||
|
|
||||||
Thanks to our great community, we've been able to re-add conda support. You can
|
Thanks to our great community, we've been able to re-add conda support. You can
|
||||||
|
@ -112,10 +123,9 @@ $ python -m spacy validate
|
||||||
### Run spaCy with GPU {#gpu new="2.0.14"}
|
### Run spaCy with GPU {#gpu new="2.0.14"}
|
||||||
|
|
||||||
As of v2.0, spaCy comes with neural network models that are implemented in our
|
As of v2.0, spaCy comes with neural network models that are implemented in our
|
||||||
machine learning library, [Thinc](https://github.com/explosion/thinc). For GPU
|
machine learning library, [Thinc](https://thinc.ai). For GPU support, we've been
|
||||||
support, we've been grateful to use the work of Chainer's
|
grateful to use the work of Chainer's [CuPy](https://cupy.chainer.org) module,
|
||||||
[CuPy](https://cupy.chainer.org) module, which provides a numpy-compatible
|
which provides a numpy-compatible interface for GPU arrays.
|
||||||
interface for GPU arrays.
|
|
||||||
|
|
||||||
spaCy can be installed on GPU by specifying `spacy[cuda]`, `spacy[cuda90]`,
|
spaCy can be installed on GPU by specifying `spacy[cuda]`, `spacy[cuda90]`,
|
||||||
`spacy[cuda91]`, `spacy[cuda92]`, `spacy[cuda100]`, `spacy[cuda101]` or
|
`spacy[cuda91]`, `spacy[cuda92]`, `spacy[cuda100]`, `spacy[cuda101]` or
|
||||||
|
|
|
@ -56,16 +56,13 @@ create a surface form. Here are some examples:
|
||||||
|
|
||||||
Morphological features are stored in the [`MorphAnalysis`](/api/morphanalysis)
|
Morphological features are stored in the [`MorphAnalysis`](/api/morphanalysis)
|
||||||
under `Token.morph`, which allows you to access individual morphological
|
under `Token.morph`, which allows you to access individual morphological
|
||||||
features. The attribute `Token.morph_` provides the morphological analysis in
|
features.
|
||||||
the Universal Dependencies
|
|
||||||
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
|
|
||||||
format.
|
|
||||||
|
|
||||||
> #### 📝 Things to try
|
> #### 📝 Things to try
|
||||||
>
|
>
|
||||||
> 1. Change "I" to "She". You should see that the morphological features change
|
> 1. Change "I" to "She". You should see that the morphological features change
|
||||||
> and express that it's a pronoun in the third person.
|
> and express that it's a pronoun in the third person.
|
||||||
> 2. Inspect `token.morph_` for the other tokens.
|
> 2. Inspect `token.morph` for the other tokens.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
|
@ -75,7 +72,7 @@ nlp = spacy.load("en_core_web_sm")
|
||||||
print("Pipeline:", nlp.pipe_names)
|
print("Pipeline:", nlp.pipe_names)
|
||||||
doc = nlp("I was reading the paper.")
|
doc = nlp("I was reading the paper.")
|
||||||
token = doc[0] # 'I'
|
token = doc[0] # 'I'
|
||||||
print(token.morph_) # 'Case=Nom|Number=Sing|Person=1|PronType=Prs'
|
print(token.morph) # 'Case=Nom|Number=Sing|Person=1|PronType=Prs'
|
||||||
print(token.morph.get("PronType")) # ['Prs']
|
print(token.morph.get("PronType")) # ['Prs']
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -91,7 +88,7 @@ import spacy
|
||||||
|
|
||||||
nlp = spacy.load("de_core_news_sm")
|
nlp = spacy.load("de_core_news_sm")
|
||||||
doc = nlp("Wo bist du?") # English: 'Where are you?'
|
doc = nlp("Wo bist du?") # English: 'Where are you?'
|
||||||
print(doc[2].morph_) # 'Case=Nom|Number=Sing|Person=2|PronType=Prs'
|
print(doc[2].morph) # 'Case=Nom|Number=Sing|Person=2|PronType=Prs'
|
||||||
print(doc[2].pos_) # 'PRON'
|
print(doc[2].pos_) # 'PRON'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -117,7 +114,7 @@ import spacy
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
doc = nlp("Where are you?")
|
doc = nlp("Where are you?")
|
||||||
print(doc[2].morph_) # 'Case=Nom|Person=2|PronType=Prs'
|
print(doc[2].morph) # 'Case=Nom|Person=2|PronType=Prs'
|
||||||
print(doc[2].pos_) # 'PRON'
|
print(doc[2].pos_) # 'PRON'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -30,7 +30,7 @@ import QuickstartModels from 'widgets/quickstart-models.js'
|
||||||
## Language support {#languages}
|
## Language support {#languages}
|
||||||
|
|
||||||
spaCy currently provides support for the following languages. You can help by
|
spaCy currently provides support for the following languages. You can help by
|
||||||
[improving the existing language data](/usage/adding-languages#language-data)
|
improving the existing [language data](/usage/linguistic-features#language-data)
|
||||||
and extending the tokenization patterns.
|
and extending the tokenization patterns.
|
||||||
[See here](https://github.com/explosion/spaCy/issues/3056) for details on how to
|
[See here](https://github.com/explosion/spaCy/issues/3056) for details on how to
|
||||||
contribute to development.
|
contribute to development.
|
||||||
|
@ -83,74 +83,95 @@ To train a pipeline using the neutral multi-language class, you can set
|
||||||
import the `MultiLanguage` class directly, or call
|
import the `MultiLanguage` class directly, or call
|
||||||
[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading.
|
[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading.
|
||||||
|
|
||||||
### Chinese language support {#chinese new=2.3}
|
### Chinese language support {#chinese new="2.3"}
|
||||||
|
|
||||||
The Chinese language class supports three word segmentation options:
|
The Chinese language class supports three word segmentation options: `char`,
|
||||||
|
`jieba` and `pkuseg`.
|
||||||
|
|
||||||
|
> #### Manual setup
|
||||||
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.lang.zh import Chinese
|
> from spacy.lang.zh import Chinese
|
||||||
>
|
>
|
||||||
> # Character segmentation (default)
|
> # Character segmentation (default)
|
||||||
> nlp = Chinese()
|
> nlp = Chinese()
|
||||||
>
|
|
||||||
> # Jieba
|
> # Jieba
|
||||||
> cfg = {"segmenter": "jieba"}
|
> cfg = {"segmenter": "jieba"}
|
||||||
> nlp = Chinese(meta={"tokenizer": {"config": cfg}})
|
> nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
|
||||||
>
|
|
||||||
> # PKUSeg with "default" model provided by pkuseg
|
> # PKUSeg with "default" model provided by pkuseg
|
||||||
> cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
|
> cfg = {"segmenter": "pkuseg"}
|
||||||
> nlp = Chinese(meta={"tokenizer": {"config": cfg}})
|
> nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
|
||||||
|
> nlp.tokenizer.initialize(pkuseg_model="default")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
1. **Character segmentation:** Character segmentation is the default
|
```ini
|
||||||
segmentation option. It's enabled when you create a new `Chinese` language
|
### config.cfg
|
||||||
class or call `spacy.blank("zh")`.
|
[nlp.tokenizer]
|
||||||
2. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word
|
@tokenizers = "spacy.zh.ChineseTokenizer"
|
||||||
segmentation with the tokenizer option `{"segmenter": "jieba"}`.
|
segmenter = "char"
|
||||||
3. **PKUSeg**: As of spaCy v2.3.0, support for
|
|
||||||
[PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support
|
|
||||||
better segmentation for Chinese OntoNotes and the provided
|
|
||||||
[Chinese pipelines](/models/zh). Enable PKUSeg with the tokenizer option
|
|
||||||
`{"segmenter": "pkuseg"}`.
|
|
||||||
|
|
||||||
<Infobox variant="warning">
|
|
||||||
|
|
||||||
In spaCy v3.0, the default Chinese word segmenter has switched from Jieba to
|
|
||||||
character segmentation. Also note that
|
|
||||||
[`pkuseg`](https://github.com/lancopku/pkuseg-python) doesn't yet ship with
|
|
||||||
pre-compiled wheels for Python 3.8. If you're running Python 3.8, you can
|
|
||||||
install it from our fork and compile it locally:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
$ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
| Segmenter | Description |
|
||||||
|
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
|
| `char` | **Character segmentation:** Character segmentation is the default segmentation option. It's enabled when you create a new `Chinese` language class or call `spacy.blank("zh")`. |
|
||||||
|
| `jieba` | **Jieba:** to use [Jieba](https://github.com/fxsjy/jieba) for word segmentation, you can set the option `segmenter` to `"jieba"`. |
|
||||||
|
| `pkuseg` | **PKUSeg**: As of spaCy v2.3.0, support for [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support better segmentation for Chinese OntoNotes and the provided [Chinese pipelines](/models/zh). Enable PKUSeg by setting tokenizer option `segmenter` to `"pkuseg"`. |
|
||||||
|
|
||||||
|
<Infobox title="Changed in v3.0" variant="warning">
|
||||||
|
|
||||||
|
In v3.0, the default word segmenter has switched from Jieba to character
|
||||||
|
segmentation. Because the `pkuseg` segmenter depends on a model that can be
|
||||||
|
loaded from a file, the model is loaded on
|
||||||
|
[initialization](/usage/training#config-lifecycle) (typically before training).
|
||||||
|
This ensures that your packaged Chinese model doesn't depend on a local path at
|
||||||
|
runtime.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
<Accordion title="Details on spaCy's Chinese API">
|
<Accordion title="Details on spaCy's Chinese API">
|
||||||
|
|
||||||
The `meta` argument of the `Chinese` language class supports the following
|
The `initialize` method for the Chinese tokenizer class supports the following
|
||||||
following tokenizer config settings:
|
config settings for loading `pkuseg` models:
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------ | --------------------------------------------------------------------------------------------------------------- |
|
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `segmenter` | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`. ~~str~~ |
|
| `pkuseg_model` | Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~ |
|
||||||
| `pkuseg_model` | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~ |
|
| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. Defaults to `"default"`. ~~str~~ |
|
||||||
| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. ~~str~~ |
|
|
||||||
|
The initialization settings are typically provided in the
|
||||||
|
[training config](/usage/training#config) and the data is loaded in before
|
||||||
|
training and serialized with the model. This allows you to load the data from a
|
||||||
|
local path and save out your pipeline and config, without requiring the same
|
||||||
|
local path at runtime. See the usage guide on the
|
||||||
|
[config lifecycle](/usage/training#config-lifecycle) for more background on
|
||||||
|
this.
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### config.cfg
|
||||||
|
[initialize]
|
||||||
|
|
||||||
|
[initialize.tokenizer]
|
||||||
|
pkuseg_model = "/path/to/model"
|
||||||
|
pkuseg_user_dict = "default"
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also initialize the tokenizer for a blank language class by calling its
|
||||||
|
`initialize` method:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### Examples
|
### Examples
|
||||||
|
# Initialize the pkuseg tokenizer
|
||||||
|
cfg = {"segmenter": "pkuseg"}
|
||||||
|
nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
|
||||||
|
|
||||||
# Load "default" model
|
# Load "default" model
|
||||||
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
|
nlp.tokenizer.initialize(pkuseg_model="default")
|
||||||
nlp = Chinese(config={"tokenizer": {"config": cfg}})
|
|
||||||
|
|
||||||
# Load local model
|
# Load local model
|
||||||
cfg = {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"}
|
nlp.tokenizer.initialize(pkuseg_model="/path/to/pkuseg_model")
|
||||||
nlp = Chinese(config={"tokenizer": {"config": cfg}})
|
|
||||||
|
|
||||||
# Override the user directory
|
# Override the user directory
|
||||||
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default", "pkuseg_user_dict": "/path"}
|
nlp.tokenizer.initialize(pkuseg_model="default", pkuseg_user_dict="/path/to/user_dict")
|
||||||
nlp = Chinese(config={"tokenizer": {"config": cfg}})
|
|
||||||
```
|
```
|
||||||
|
|
||||||
You can also modify the user dictionary on-the-fly:
|
You can also modify the user dictionary on-the-fly:
|
||||||
|
@ -185,36 +206,46 @@ from spacy.lang.zh import Chinese
|
||||||
|
|
||||||
# Train pkuseg model
|
# Train pkuseg model
|
||||||
pkuseg.train("train.utf8", "test.utf8", "/path/to/pkuseg_model")
|
pkuseg.train("train.utf8", "test.utf8", "/path/to/pkuseg_model")
|
||||||
|
|
||||||
# Load pkuseg model in spaCy Chinese tokenizer
|
# Load pkuseg model in spaCy Chinese tokenizer
|
||||||
nlp = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True}}})
|
cfg = {"segmenter": "pkuseg"}
|
||||||
|
nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
|
||||||
|
nlp.tokenizer.initialize(pkuseg_model="/path/to/pkuseg_model")
|
||||||
```
|
```
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
### Japanese language support {#japanese new=2.3}
|
### Japanese language support {#japanese new=2.3}
|
||||||
|
|
||||||
|
> #### Manual setup
|
||||||
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.lang.ja import Japanese
|
> from spacy.lang.ja import Japanese
|
||||||
>
|
>
|
||||||
> # Load SudachiPy with split mode A (default)
|
> # Load SudachiPy with split mode A (default)
|
||||||
> nlp = Japanese()
|
> nlp = Japanese()
|
||||||
>
|
|
||||||
> # Load SudachiPy with split mode B
|
> # Load SudachiPy with split mode B
|
||||||
> cfg = {"split_mode": "B"}
|
> cfg = {"split_mode": "B"}
|
||||||
> nlp = Japanese(meta={"tokenizer": {"config": cfg}})
|
> nlp = Japanese.from_config({"nlp": {"tokenizer": cfg}})
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
The Japanese language class uses
|
The Japanese language class uses
|
||||||
[SudachiPy](https://github.com/WorksApplications/SudachiPy) for word
|
[SudachiPy](https://github.com/WorksApplications/SudachiPy) for word
|
||||||
segmentation and part-of-speech tagging. The default Japanese language class and
|
segmentation and part-of-speech tagging. The default Japanese language class and
|
||||||
the provided Japanese pipelines use SudachiPy split mode `A`. The `meta`
|
the provided Japanese pipelines use SudachiPy split mode `A`. The tokenizer
|
||||||
argument of the `Japanese` language class can be used to configure the split
|
config can be used to configure the split mode to `A`, `B` or `C`.
|
||||||
mode to `A`, `B` or `C`.
|
|
||||||
|
```ini
|
||||||
|
### config.cfg
|
||||||
|
[nlp.tokenizer]
|
||||||
|
@tokenizers = "spacy.ja.JapaneseTokenizer"
|
||||||
|
split_mode = "A"
|
||||||
|
```
|
||||||
|
|
||||||
<Infobox variant="warning">
|
<Infobox variant="warning">
|
||||||
|
|
||||||
If you run into errors related to `sudachipy`, which is currently under active
|
If you run into errors related to `sudachipy`, which is currently under active
|
||||||
development, we suggest downgrading to `sudachipy==0.4.5`, which is the version
|
development, we suggest downgrading to `sudachipy==0.4.9`, which is the version
|
||||||
used for training the current [Japanese pipelines](/models/ja).
|
used for training the current [Japanese pipelines](/models/ja).
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
|
@ -895,6 +895,10 @@ the name. Registered functions can also take **arguments** by the way that can
|
||||||
be defined in the config as well – you can read more about this in the docs on
|
be defined in the config as well – you can read more about this in the docs on
|
||||||
[training with custom code](/usage/training#custom-code).
|
[training with custom code](/usage/training#custom-code).
|
||||||
|
|
||||||
|
### Initializing components with data {#initialization}
|
||||||
|
|
||||||
|
<!-- TODO: -->
|
||||||
|
|
||||||
### Python type hints and pydantic validation {#type-hints new="3"}
|
### Python type hints and pydantic validation {#type-hints new="3"}
|
||||||
|
|
||||||
spaCy's configs are powered by our machine learning library Thinc's
|
spaCy's configs are powered by our machine learning library Thinc's
|
||||||
|
|
|
@ -291,7 +291,7 @@ installed in the same environment – that's it.
|
||||||
| Entry point | Description |
|
| Entry point | Description |
|
||||||
| ------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| [`spacy_factories`](#entry-points-components) | Group of entry points for pipeline component factories, keyed by component name. Can be used to expose custom components defined by another package. |
|
| [`spacy_factories`](#entry-points-components) | Group of entry points for pipeline component factories, keyed by component name. Can be used to expose custom components defined by another package. |
|
||||||
| [`spacy_languages`](#entry-points-languages) | Group of entry points for custom [`Language` subclasses](/usage/adding-languages), keyed by language shortcut. |
|
| [`spacy_languages`](#entry-points-languages) | Group of entry points for custom [`Language` subclasses](/usage/linguistic-features#language-data), keyed by language shortcut. |
|
||||||
| `spacy_lookups` <Tag variant="new">2.2</Tag> | Group of entry points for custom [`Lookups`](/api/lookups), including lemmatizer data. Used by spaCy's [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) package. |
|
| `spacy_lookups` <Tag variant="new">2.2</Tag> | Group of entry points for custom [`Lookups`](/api/lookups), including lemmatizer data. Used by spaCy's [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) package. |
|
||||||
| [`spacy_displacy_colors`](#entry-points-displacy) <Tag variant="new">2.2</Tag> | Group of entry points of custom label colors for the [displaCy visualizer](/usage/visualizers#ent). The key name doesn't matter, but it should point to a dict of labels and color values. Useful for custom models that predict different entity types. |
|
| [`spacy_displacy_colors`](#entry-points-displacy) <Tag variant="new">2.2</Tag> | Group of entry points of custom label colors for the [displaCy visualizer](/usage/visualizers#ent). The key name doesn't matter, but it should point to a dict of labels and color values. Useful for custom models that predict different entity types. |
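For example, a hypothetical `setup.py` exposing a custom component factory via the `spacy_factories` group – the package and function names are placeholders:

```python
from setuptools import setup

setup(
    name="my-spacy-extension",
    packages=["my_spacy_extension"],
    entry_points={
        # Expose the factory under the component name "my_component"
        "spacy_factories": ["my_component = my_spacy_extension:create_my_component"]
    },
)
```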
|
||||||
|
|
||||||
|
|
|
@ -200,7 +200,7 @@ import Tokenization101 from 'usage/101/\_tokenization.md'
|
||||||
To learn more about how spaCy's tokenization rules work in detail, how to
|
To learn more about how spaCy's tokenization rules work in detail, how to
|
||||||
**customize and replace** the default tokenizer and how to **add
|
**customize and replace** the default tokenizer and how to **add
|
||||||
language-specific data**, see the usage guides on
|
language-specific data**, see the usage guides on
|
||||||
[adding languages](/usage/adding-languages) and
|
[language data](/usage/linguistic-features#language-data) and
|
||||||
[customizing the tokenizer](/usage/linguistic-features#tokenization).
|
[customizing the tokenizer](/usage/linguistic-features#tokenization).
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
@ -479,7 +479,7 @@ find a "Suggest edits" link at the bottom of each page that points you to the
|
||||||
source.
|
source.
|
||||||
|
|
||||||
Another way of getting involved is to help us improve the
|
Another way of getting involved is to help us improve the
|
||||||
[language data](/usage/adding-languages#language-data) – especially if you
|
[language data](/usage/linguistic-features#language-data) – especially if you
|
||||||
happen to speak one of the languages currently in
|
happen to speak one of the languages currently in
|
||||||
[alpha support](/usage/models#languages). Even adding simple tokenizer
|
[alpha support](/usage/models#languages). Even adding simple tokenizer
|
||||||
exceptions, stop words or lemmatizer data can make a big difference. It will
|
exceptions, stop words or lemmatizer data can make a big difference. It will
|
||||||
|
|
|
@ -216,7 +216,9 @@ The initialization settings are only loaded and used when
|
||||||
[`nlp.initialize`](/api/language#initialize) is called (typically right before
|
[`nlp.initialize`](/api/language#initialize) is called (typically right before
|
||||||
training). This allows you to set up your pipeline using local data resources
|
training). This allows you to set up your pipeline using local data resources
|
||||||
and custom functions, and preserve the information in your config – but without
|
and custom functions, and preserve the information in your config – but without
|
||||||
requiring it to be available at runtime
|
requiring it to be available at runtime. You can also use this mechanism to
|
||||||
|
provide data paths to custom pipeline components and custom tokenizers – see the
|
||||||
|
section on [custom initialization](#initialization) for details.
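A minimal sketch of when this happens – nothing from `[initialize]` is resolved until `nlp.initialize()` runs (shown here with a blank pipeline for brevity):

```python
import spacy

nlp = spacy.blank("en")
# No initialization settings or data resources have been loaded yet
nlp.initialize()  # this is the point where [initialize] resources are loaded
```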
|
||||||
|
|
||||||
### Overwriting config settings on the command line {#config-overrides}
|
### Overwriting config settings on the command line {#config-overrides}
|
||||||
|
|
||||||
|
@ -815,9 +817,9 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
|
||||||
return create_model(output_width)
|
return create_model(output_width)
|
||||||
```
|
```
|
||||||
|
|
||||||
<!-- TODO:
|
|
||||||
### Customizing the initialization {#initialization}
|
### Customizing the initialization {#initialization}
|
||||||
-->
|
|
||||||
|
<!-- TODO: -->
|
||||||
|
|
||||||
## Data utilities {#data}
|
## Data utilities {#data}
|
||||||
|
|
||||||
|
@ -1011,9 +1013,136 @@ def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Examp
|
||||||
<!-- TODO:
|
<!-- TODO:
|
||||||
* Custom corpus class
|
* Custom corpus class
|
||||||
* Minibatching
|
* Minibatching
|
||||||
* Data augmentation
|
|
||||||
-->
|
-->
|
||||||
|
|
||||||
|
### Data augmentation {#data-augmentation}
|
||||||
|
|
||||||
|
Data augmentation is the process of applying small **modifications** to the
|
||||||
|
training data. It can be especially useful for punctuation and case replacement
|
||||||
|
– for example, if your corpus only uses smart quotes and you want to include
|
||||||
|
variations using regular quotes, or to make the model less sensitive to
|
||||||
|
capitalization by including a mix of capitalized and lowercase examples.
|
||||||
|
|
||||||
|
The easiest way to use data augmentation during training is to provide an
|
||||||
|
`augmenter` to the training corpus, e.g. in the `[corpora.train]` section of
|
||||||
|
your config. The built-in [`orth_variants`](/api/top-level#orth_variants)
|
||||||
|
augmenter creates a data augmentation callback that uses orth-variant
|
||||||
|
replacement.
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### config.cfg (excerpt) {highlight="8,14"}
|
||||||
|
[corpora.train]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.train}
|
||||||
|
gold_preproc = false
|
||||||
|
max_length = 0
|
||||||
|
limit = 0
|
||||||
|
|
||||||
|
[corpora.train.augmenter]
|
||||||
|
@augmenters = "spacy.orth_variants.v1"
|
||||||
|
# Percentage of texts that will be augmented / lowercased
|
||||||
|
level = 0.1
|
||||||
|
lower = 0.5
|
||||||
|
|
||||||
|
[corpora.train.augmenter.orth_variants]
|
||||||
|
@readers = "srsly.read_json.v1"
|
||||||
|
path = "corpus/orth_variants.json"
|
||||||
|
```
|
||||||
|
|
||||||
|
The `orth_variants` argument lets you pass in a dictionary of replacement rules,
|
||||||
|
typically loaded from a JSON file. There are two types of orth variant rules:
|
||||||
|
`"single"` for single tokens that should be replaced (e.g. hyphens) and
|
||||||
|
`"paired"` for pairs of tokens (e.g. quotes).
|
||||||
|
|
||||||
|
<!-- prettier-ignore -->
|
||||||
|
```json
|
||||||
|
### orth_variants.json
|
||||||
|
{
|
||||||
|
"single": [{ "tags": ["NFP"], "variants": ["…", "..."] }],
|
||||||
|
"paired": [{ "tags": ["``", "''"], "variants": [["'", "'"], ["‘", "’"]] }]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
<Accordion title="Full examples for English and German" spaced>
|
||||||
|
|
||||||
|
```json
|
||||||
|
https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json
|
||||||
|
```
|
||||||
|
|
||||||
|
```json
|
||||||
|
https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/de_orth_variants.json
|
||||||
|
```
|
||||||
|
|
||||||
|
</Accordion>
|
||||||
|
|
||||||
|
<Infobox title="Important note" variant="warning">
|
||||||
|
|
||||||
|
When adding data augmentation, keep in mind that it typically only makes sense
|
||||||
|
to apply it to the **training corpus**, not the development data.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
#### Writing custom data augmenters {#data-augmentation-custom}
|
||||||
|
|
||||||
|
Using the [`@spacy.augmenters`](/api/top-level#registry) registry, you can also
|
||||||
|
register your own data augmentation callbacks. The callback should be a function
|
||||||
|
that takes the current `nlp` object and a training [`Example`](/api/example) and
|
||||||
|
yields `Example` objects. Keep in mind that the augmenter should yield **all
|
||||||
|
examples** you want to use in your corpus, not only the augmented examples
|
||||||
|
(unless you want to augment all examples).
|
||||||
|
|
||||||
|
Here's an example of a custom augmentation callback that produces text variants
|
||||||
|
in ["SpOnGeBoB cAsE"](https://knowyourmeme.com/memes/mocking-spongebob). The
|
||||||
|
registered function takes one argument `randomize` that can be set via the
|
||||||
|
config and decides whether the uppercase/lowercase transformation is applied
|
||||||
|
randomly or not. The augmenter yields two `Example` objects: the original
|
||||||
|
example and the augmented example.
|
||||||
|
|
||||||
|
> #### config.cfg
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [corpora.train.augmenter]
|
||||||
|
> @augmenters = "spongebob_augmenter.v1"
|
||||||
|
> randomize = false
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```python
|
||||||
|
import spacy
|
||||||
|
import random
|
||||||
|
|
||||||
|
@spacy.registry.augmenters("spongebob_augmenter.v1")
|
||||||
|
def create_augmenter(randomize: bool = False):
|
||||||
|
def augment(nlp, example):
|
||||||
|
text = example.text
|
||||||
|
if randomize:
|
||||||
|
# Randomly uppercase/lowercase characters
|
||||||
|
chars = [c.lower() if random.random() < 0.5 else c.upper() for c in text]
|
||||||
|
else:
|
||||||
|
# Uppercase followed by lowercase
|
||||||
|
chars = [c.lower() if i % 2 else c.upper() for i, c in enumerate(text)]
|
||||||
|
# Create augmented training example
|
||||||
|
example_dict = example.to_dict()
|
||||||
|
doc = nlp.make_doc("".join(chars))
|
||||||
|
example_dict["token_annotation"]["ORTH"] = [t.text for t in doc]
|
||||||
|
# Original example followed by augmented example
|
||||||
|
yield example
|
||||||
|
yield example.from_dict(doc, example_dict)
|
||||||
|
|
||||||
|
return augment
|
||||||
|
```
|
||||||
|
|
||||||
|
An easy way to create modified `Example` objects is to use the
|
||||||
|
[`Example.from_dict`](/api/example#from_dict) method with a new reference
|
||||||
|
[`Doc`](/api/doc) created from the modified text. In this case, only the
|
||||||
|
capitalization changes, so only the `ORTH` values of the tokens will be
|
||||||
|
different between the original and augmented examples.
|
||||||
|
|
||||||
|
Note that if your data augmentation strategy involves changing the tokenization
|
||||||
|
(for instance, removing or adding tokens) and your training examples include
|
||||||
|
token-based annotations like the dependency parse or entity labels, you'll need
|
||||||
|
to take care to adjust the `Example` object so its annotations match and remain
|
||||||
|
valid.
|
||||||
|
|
||||||
## Parallel & distributed training with Ray {#parallel-training}
|
## Parallel & distributed training with Ray {#parallel-training}
|
||||||
|
|
||||||
> #### Installation
|
> #### Installation
|
||||||
|

@@ -1124,17 +1253,6 @@ a dictionary with keyword arguments specifying the annotations, like `tags` or
 annotations, the model can be updated to learn a sentence of three words with
 their assigned part-of-speech tags.

-> #### About the tag map
->
-> The tag map is part of the vocabulary and defines the annotation scheme. If
-> you're training a new pipeline, this will let you map the tags present in the
-> treebank you train on to spaCy's tag scheme:
->
-> ```python
-> tag_map = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}}
-> vocab = Vocab(tag_map=tag_map)
-> ```
-
 ```python
 words = ["I", "like", "stuff"]
 tags = ["NOUN", "VERB", "NOUN"]

@@ -24,7 +24,7 @@
     "TransformerData": "/api/transformer#transformerdata",
     "FullTransformerBatch": "/api/transformer#fulltransformerbatch",
     "Corpus": "/api/corpus",
-    "JsonlTexts": "/api/corpus#jsonltexts",
+    "JsonlCorpus": "/api/corpus#jsonlcorpus",
     "LexemeC": "/api/cython-structs#lexemec",
     "TokenC": "/api/cython-structs#tokenc",
     "Config": "https://thinc.ai/docs/api-config#config",

@@ -24,6 +24,7 @@ const Quickstart = ({
     rawContent = null,
     id = 'quickstart',
     setters = {},
+    showDropdown = {},
     hidePrompts,
     small,
     codeLang,
@@ -107,6 +108,8 @@ const Quickstart = ({
 }) => {
     // Optional function that's called with the value
     const setterFunc = setters[id] || (() => {})
+    // Check if dropdown should be shown
+    const dropdownGetter = showDropdown[id] || (() => true)
     return (
         <div key={id} data-quickstart-group={id} className={classes.group}>
             <style data-quickstart-style={id} scoped>
@@ -123,37 +126,6 @@ const Quickstart = ({
                 )}
             </div>
             <div className={classes.fields}>
-                {!!dropdown.length && (
-                    <select
-                        defaultValue={defaultValue}
-                        className={classes.select}
-                        onChange={({ target }) => {
-                            const value = target.value
-                            if (value != other) {
-                                setterFunc(value)
-                                setOther(id, false)
-                            } else {
-                                setterFunc('')
-                                setOther(id, true)
-                            }
-                        }}
-                    >
-                        {dropdown.map(({ id, title }) => (
-                            <option key={id} value={id}>
-                                {title}
-                            </option>
-                        ))}
-                        {other && <option value={other}>{other}</option>}
-                    </select>
-                )}
-                {other && otherState[id] && (
-                    <input
-                        type="text"
-                        className={classes.textInput}
-                        placeholder="Type here..."
-                        onChange={({ target }) => setterFunc(target.value)}
-                    />
-                )}
                 {options.map(option => {
                     const optionType = multiple ? 'checkbox' : 'radio'
                     const checkedForId = checked[id] || []
@@ -179,7 +151,10 @@ const Quickstart = ({
 type={optionType}
 className={classNames(
     classes.input,
-    classes[optionType]
+    classes[optionType],
+    {
+        [classes.long]: options.length >= 4,
+    }
 )}
 name={id}
 id={`quickstart-${option.id}`}
@@ -209,6 +184,41 @@ const Quickstart = ({
                         </Fragment>
                     )
                 })}
+                <span className={classes.fieldExtra}>
+                    {!!dropdown.length && (
+                        <select
+                            defaultValue={defaultValue}
+                            className={classNames(classes.select, {
+                                [classes.selectHidden]: !dropdownGetter(),
+                            })}
+                            onChange={({ target }) => {
+                                const value = target.value
+                                if (value != other) {
+                                    setterFunc(value)
+                                    setOther(id, false)
+                                } else {
+                                    setterFunc('')
+                                    setOther(id, true)
+                                }
+                            }}
+                        >
+                            {dropdown.map(({ id, title }) => (
+                                <option key={id} value={id}>
+                                    {title}
+                                </option>
+                            ))}
+                            {other && <option value={other}>{other}</option>}
+                        </select>
+                    )}
+                    {other && otherState[id] && (
+                        <input
+                            type="text"
+                            className={classes.textInput}
+                            placeholder="Type here..."
+                            onChange={({ target }) => setterFunc(target.value)}
+                        />
+                    )}
+                </span>
             </div>
         </div>
     )

@@ -36,22 +36,37 @@
 .label
     cursor: pointer
-    border: 1px solid var(--color-subtle)
-    border-radius: var(--border-radius)
     display: inline-block
-    padding: 0.65rem 1.25rem
-    margin: 0 0.5rem 0.75rem 0
+    padding: 0.35rem 0.5rem 0.25rem 0
+    margin: 0 1rem 0.75rem 0
     font-size: var(--font-size-xs)
     font-weight: bold
-    background: var(--color-back)

     &:hover
         background: var(--color-subtle-light)

     .input:focus + &
         border: 1px solid var(--color-theme)
         outline: none

+    .radio + &
+        margin: 0 0 0.75rem 0
+        border-radius: 0
+        border-width: 1px 0 1px 1px
+        border-style: solid
+        border-color: var(--color-subtle)
+        background: var(--color-back)
+        padding: 0.65rem 1.25rem
+
+        &:nth-child(2) // first child is checkbox
+            border-top-left-radius: var(--border-radius)
+            border-bottom-left-radius: var(--border-radius)
+
+        &:nth-last-child(2) // last child is additional container
+            border-top-right-radius: var(--border-radius)
+            border-bottom-right-radius: var(--border-radius)
+            border-right-width: 1px
+
     .radio:checked + &
         color: var(--color-back)
         border-color: var(--color-theme)
@@ -64,9 +79,10 @@
     height: 20px
     border: 1px solid var(--color-subtle)
     vertical-align: middle
-    margin-right: 1rem
+    margin-right: 0.5rem
     cursor: pointer
     border-radius: var(--border-radius)
+    background: var(--color-back)

 .checkbox:checked + &:before
     // Embed "check" icon here for simplicity
@@ -74,6 +90,9 @@
     background-size: contain
     border-color: var(--color-theme)

+.field-extra:not(:empty):not(:first-child)
+    margin-left: 1rem
+
 .legend
     color: var(--color-dark)
     padding: 0.75rem 0
@@ -93,6 +112,9 @@
     font-size: var(--font-size-sm)
     background: var(--color-back)

+.select-hidden
+    display: none
+
 .text-input
     border: 1px solid var(--color-subtle)
     border-radius: var(--border-radius)

@@ -1,9 +1,20 @@
-import React from 'react'
+import React, { useState } from 'react'
 import { StaticQuery, graphql } from 'gatsby'

 import { Quickstart, QS } from '../components/quickstart'
 import { repo } from '../components/util'

+const DEFAULT_HARDWARE = 'cpu'
+const DEFAULT_CUDA = 'cuda100'
+const CUDA = {
+    '8.0': 'cuda80',
+    '9.0': 'cuda90',
+    '9.1': 'cuda91',
+    '9.2': 'cuda92',
+    '10.0': 'cuda100',
+    '10.1': 'cuda101',
+    '10.2': 'cuda102',
+}
 const DATA = [
     {
         id: 'os',
@@ -23,6 +34,16 @@ const DATA = [
             { id: 'source', title: 'from source' },
         ],
     },
+    {
+        id: 'hardware',
+        title: 'Hardware',
+        options: [
+            { id: 'cpu', title: 'CPU', checked: DEFAULT_HARDWARE === 'cpu' },
+            { id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE == 'gpu' },
+        ],
+        dropdown: Object.keys(CUDA).map(id => ({ id: CUDA[id], title: `CUDA ${id}` })),
+        defaultValue: DEFAULT_CUDA,
+    },
     {
         id: 'config',
         title: 'Configuration',
@@ -30,100 +51,115 @@ const DATA = [
         options: [
             {
                 id: 'venv',
-                title: 'virtualenv',
+                title: 'virtual env',
                 help: 'Use a virtual environment and install spaCy into a user directory',
             },
-        ],
-    },
-    {
-        id: 'addition',
-        title: 'Additions',
-        multiple: true,
-        options: [
             {
-                id: 'transformers',
-                title: 'Transformers',
-                help: 'Use transformers like BERT to train your spaCy pipelines',
-            },
-            {
-                id: 'lookups',
-                title: 'Lemmatizer data',
-                help: 'Install additional lookup tables and rules for lemmatization',
+                id: 'train',
+                title: 'train models',
+                help:
+                    'Check this if you plan to train your own models with spaCy to install extra dependencies and data resources',
             },
         ],
     },
 ]

-const QuickstartInstall = ({ id, title }) => (
-    <StaticQuery
-        query={query}
-        render={({ site }) => {
-            const { nightly, languages } = site.siteMetadata
-            const models = languages.filter(({ models }) => models !== null)
-            const data = [
-                ...DATA,
-                {
-                    id: 'models',
-                    title: 'Trained Pipelines',
-                    multiple: true,
-                    options: models.map(({ code, name }) => ({ id: code, title: name })),
-                },
-            ]
-            return (
-                <Quickstart data={data} title={title} id={id}>
-                    <QS config="venv">python -m venv .env</QS>
-                    <QS config="venv" os="mac">
-                        source .env/bin/activate
-                    </QS>
-                    <QS config="venv" os="linux">
-                        source .env/bin/activate
-                    </QS>
-                    <QS config="venv" os="windows">
-                        .env\Scripts\activate
-                    </QS>
-                    <QS package="pip">pip install -U spacy</QS>
-                    <QS package="conda">conda install -c conda-forge spacy</QS>
-                    <QS package="source">
-                        git clone https://github.com/{repo}
-                        {nightly ? ` --branch develop` : ''}
-                    </QS>
-                    <QS package="source">cd spaCy</QS>
-                    <QS package="source" os="linux">
-                        export PYTHONPATH=`pwd`
-                    </QS>
-                    <QS package="source" os="windows">
-                        set PYTHONPATH=C:\path\to\spaCy
-                    </QS>
-                    <QS package="source">pip install -r requirements.txt</QS>
-                    <QS addition="transformers" package="pip">
-                        pip install -U spacy-transformers
-                    </QS>
-                    <QS addition="transformers" package="source">
-                        pip install -U spacy-transformers
-                    </QS>
-                    <QS addition="transformers" package="conda">
-                        conda install -c conda-forge spacy-transformers
-                    </QS>
-                    <QS addition="lookups" package="pip">
-                        pip install -U spacy-lookups-data
-                    </QS>
-                    <QS addition="lookups" package="source">
-                        pip install -U spacy-lookups-data
-                    </QS>
-                    <QS addition="lookups" package="conda">
-                        conda install -c conda-forge spacy-lookups-data
-                    </QS>
-                    <QS package="source">python setup.py build_ext --inplace</QS>
-                    {models.map(({ code, models: modelOptions }) => (
-                        <QS models={code} key={code}>
-                            python -m spacy download {modelOptions[0]}
-                        </QS>
-                    ))}
-                </Quickstart>
-            )
-        }}
-    />
-)
+const QuickstartInstall = ({ id, title }) => {
+    const [train, setTrain] = useState(false)
+    const [hardware, setHardware] = useState(DEFAULT_HARDWARE)
+    const [cuda, setCuda] = useState(DEFAULT_CUDA)
+    const setters = {
+        hardware: v => (Array.isArray(v) ? setHardware(v[0]) : setCuda(v)),
+        config: v => setTrain(v.includes('train')),
+    }
+    const showDropdown = {
+        hardware: () => hardware === 'gpu',
+    }
+    const pipExtras = [hardware === 'gpu' && cuda, train && 'transformers', train && 'lookups']
+        .filter(e => e)
+        .join(',')
+    return (
+        <StaticQuery
+            query={query}
+            render={({ site }) => {
+                const { nightly, languages } = site.siteMetadata
+                const pkg = nightly ? 'spacy-nightly' : 'spacy'
+                const models = languages.filter(({ models }) => models !== null)
+                const data = [
+                    ...DATA,
+                    {
+                        id: 'models',
+                        title: 'Trained Pipelines',
+                        multiple: true,
+                        options: models
+                            .sort((a, b) => a.name.localeCompare(b.name))
+                            .map(({ code, name }) => ({ id: code, title: name })),
+                    },
+                ]
+                return (
+                    <Quickstart
+                        data={data}
+                        title={title}
+                        id={id}
+                        setters={setters}
+                        showDropdown={showDropdown}
+                    >
+                        <QS config="venv">python -m venv .env</QS>
+                        <QS config="venv" os="mac">
+                            source .env/bin/activate
+                        </QS>
+                        <QS config="venv" os="linux">
+                            source .env/bin/activate
+                        </QS>
+                        <QS config="venv" os="windows">
+                            .env\Scripts\activate
+                        </QS>
+                        <QS package="pip">pip install -U pip setuptools wheel</QS>
+                        <QS package="source">pip install -U pip setuptools wheel</QS>
+                        <QS package="pip">
+                            pip install -U {pkg}
+                            {pipExtras && `[${pipExtras}]`}
+                            {nightly ? ' --pre' : ''}
+                        </QS>
+                        <QS package="conda">conda install -c conda-forge spacy</QS>
+                        <QS package="conda" hardware="gpu">
+                            conda install -c conda-forge cupy
+                        </QS>
+                        <QS package="source">
+                            git clone https://github.com/{repo}
+                            {nightly ? ` --branch develop` : ''}
+                        </QS>
+                        <QS package="source">cd spaCy</QS>
+                        <QS package="source" os="linux">
+                            export PYTHONPATH=`pwd`
+                        </QS>
+                        <QS package="source" os="windows">
+                            set PYTHONPATH=C:\path\to\spaCy
+                        </QS>
+                        <QS package="source">pip install -r requirements.txt</QS>
+                        <QS package="source">python setup.py build_ext --inplace</QS>
+                        <QS package="source" config="train">
+                            pip install -e '.[{pipExtras}]'
+                        </QS>
+
+                        <QS config="train" package="conda">
+                            conda install -c conda-forge spacy-transformers
+                        </QS>
+                        <QS config="train" package="conda">
+                            conda install -c conda-forge spacy-lookups-data
+                        </QS>
+
+                        {models.map(({ code, models: modelOptions }) => (
+                            <QS models={code} key={code}>
+                                python -m spacy download {modelOptions[0]}
+                            </QS>
+                        ))}
+                    </Quickstart>
+                )
+            }}
+        />
+    )
+}

 export default QuickstartInstall

@@ -1,12 +1,16 @@
-import React, { Fragment } from 'react'
+import React, { Fragment, useState } from 'react'
 import { StaticQuery, graphql } from 'gatsby'

 import { Quickstart, QS } from '../components/quickstart'

+const DEFAULT_LANG = 'en'
+const DEFAULT_OPT = 'efficiency'
+
 const data = [
     {
         id: 'lang',
         title: 'Language',
+        defaultValue: DEFAULT_LANG,
     },
     {
         id: 'load',
@@ -25,6 +29,16 @@ const data = [
             },
         ],
     },
+    {
+        id: 'optimize',
+        title: 'Optimize for',
+        help:
+            'Optimize for efficiency (faster & smaller model) or higher accuracy (larger & slower model)',
+        options: [
+            { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
+            { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
+        ],
+    },
     {
         id: 'config',
         title: 'Options',
@@ -33,57 +47,73 @@ const data = [
     },
 ]

-const QuickstartInstall = ({ id, title, description, defaultLang = 'en', children }) => (
-    <StaticQuery
-        query={query}
-        render={({ site }) => {
-            const models = site.siteMetadata.languages.filter(({ models }) => models !== null)
-            data[0].options = models.map(({ code, name }) => ({
-                id: code,
-                title: name,
-                checked: code === defaultLang,
-            }))
-            return (
-                <Quickstart data={data} title={title} id={id} description={description}>
-                    {models.map(({ code, models, example }) => {
-                        const pkg = models[0]
-                        const exampleText = example || 'No text available yet'
-                        return (
-                            <Fragment key={code}>
-                                <QS lang={code}>python -m spacy download {pkg}</QS>
-                                <QS lang={code} divider />
-                                <QS lang={code} load="spacy" prompt="python">
-                                    import spacy
-                                </QS>
-                                <QS lang={code} load="spacy" prompt="python">
-                                    nlp = spacy.load("{pkg}")
-                                </QS>
-                                <QS lang={code} load="module" prompt="python">
-                                    import {pkg}
-                                </QS>
-                                <QS lang={code} load="module" prompt="python">
-                                    nlp = {pkg}.load()
-                                </QS>
-                                <QS lang={code} config="example" prompt="python">
-                                    doc = nlp("{exampleText}")
-                                </QS>
-                                <QS lang={code} config="example" prompt="python">
-                                    print([
-                                    {code === 'xx'
-                                        ? '(ent.text, ent.label) for ent in doc.ents'
-                                        : '(w.text, w.pos_) for w in doc'}
-                                    ])
-                                </QS>
-                            </Fragment>
-                        )
-                    })}
+const QuickstartInstall = ({ id, title, description, children }) => {
+    const [lang, setLang] = useState(DEFAULT_LANG)
+    const [efficiency, setEfficiency] = useState(DEFAULT_OPT)
+    const setters = {
+        lang: setLang,
+        optimize: v => setEfficiency(v.includes('efficiency')),
+    }
+    return (
+        <StaticQuery
+            query={query}
+            render={({ site }) => {
+                const models = site.siteMetadata.languages.filter(({ models }) => models !== null)
+                data[0].dropdown = models
+                    .sort((a, b) => a.name.localeCompare(b.name))
+                    .map(({ code, name }) => ({
+                        id: code,
+                        title: name,
+                    }))
+                return (
+                    <Quickstart
+                        data={data}
+                        title={title}
+                        id={id}
+                        description={description}
+                        setters={setters}
+                        copy={false}
+                    >
+                        {models.map(({ code, models, example }) => {
+                            const pkg = efficiency ? models[0] : models[models.length - 1]
+                            const exampleText = example || 'No text available yet'
+                            return lang !== code ? null : (
+                                <Fragment key={code}>
+                                    <QS>python -m spacy download {pkg}</QS>
+                                    <QS divider />
+                                    <QS load="spacy" prompt="python">
+                                        import spacy
+                                    </QS>
+                                    <QS load="spacy" prompt="python">
+                                        nlp = spacy.load("{pkg}")
+                                    </QS>
+                                    <QS load="module" prompt="python">
+                                        import {pkg}
+                                    </QS>
+                                    <QS load="module" prompt="python">
+                                        nlp = {pkg}.load()
+                                    </QS>
+                                    <QS config="example" prompt="python">
+                                        doc = nlp("{exampleText}")
+                                    </QS>
+                                    <QS config="example" prompt="python">
+                                        print([
+                                        {code === 'xx'
+                                            ? '(ent.text, ent.label) for ent in doc.ents'
+                                            : '(w.text, w.pos_) for w in doc'}
+                                        ])
+                                    </QS>
+                                </Fragment>
+                            )
+                        })}

                     {children}
                 </Quickstart>
             )
         }}
     />
 )
+}

 export default QuickstartInstall