From 1d01d89b79773b04b8857354010d48717fe52673 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 7 Aug 2020 14:40:58 +0200 Subject: [PATCH 01/10] Update CLI docs and evaluate command [ci skip] --- spacy/cli/evaluate.py | 21 +++++++++++++-------- website/docs/api/cli.md | 30 ++++++++++++++++-------------- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 5b434ee32..cf77fecfd 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -17,23 +17,28 @@ from .. import displacy def evaluate_cli( # fmt: off model: str = Arg(..., help="Model name or path"), - data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True), + data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True), output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False), - gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"), + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), # fmt: on ): """ - Evaluate a model. To render a sample of parses in a HTML file, set an - output directory as the displacy_path argument. + Evaluate a model. Expects a loadable spaCy model and evaluation data in the + binary .spacy format. The --gold-preproc option sets up the evaluation + examples with gold-standard sentences and tokens for the predictions. Gold + preprocessing helps the annotations align to the tokenization, and may + result in sequences of more consistent length. However, it may reduce + runtime accuracy due to train/test skew. 
To render a sample of dependency + parses in an HTML file, set an output directory as the displacy_path argument. """ evaluate( model, data_path, output=output, - gpu_id=gpu_id, + use_gpu=use_gpu, gold_preproc=gold_preproc, displacy_path=displacy_path, displacy_limit=displacy_limit, @@ -45,7 +50,7 @@ def evaluate( model: str, data_path: Path, output: Optional[Path] = None, - gpu_id: int = -1, + use_gpu: int = -1, gold_preproc: bool = False, displacy_path: Optional[Path] = None, displacy_limit: int = 25, @@ -53,8 +58,8 @@ def evaluate( ) -> Scorer: msg = Printer(no_print=silent, pretty=not silent) fix_random_seed() - if gpu_id >= 0: - require_gpu(gpu_id) + if use_gpu >= 0: + require_gpu(use_gpu) util.set_env_log(False) data_path = util.ensure_path(data_path) output_path = util.ensure_path(output) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 3971dfb79..377b2456f 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -132,7 +132,7 @@ $ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline] | `--base`, `-b` | option | Optional base config file to auto-fill with defaults. | | `--lang`, `-l` | option | Optional language code to use for blank config. If a `--pipeline` is specified, the components will be added in order. | | `--model`, `-m` | option | Optional base model to copy config from. If a `--pipeline` is specified, only those components will be kept, and all other components not in the model will be added. | -| `--pipeline`, `-p` | option | Optional comma-separated pipeline of components to add to blank language or model. | +| `--pipeline`, `-p` | option | Optional comma-separated pipeline of components to add to blank language or model. | | **CREATES** | config | Complete and auto-filled config file for training. 
| ### init model {#init-model new="2"} @@ -446,7 +446,8 @@ Debug a Thinc [`Model`](https://thinc.ai/docs/api-model) by running it on a sample text and checking how it updates its internal weights and parameters. ```bash -$ python -m spacy debug model [config_path] [component] [--layers] [-DIM] [-PAR] [-GRAD] [-ATTR] [-P0] [-P1] [-P2] [P3] [--gpu_id] +$ python -m spacy debug model [config_path] [component] [--layers] [-DIM] +[-PAR] [-GRAD] [-ATTR] [-P0] [-P1] [-P2] [-P3] [--gpu-id] ``` @@ -641,18 +642,19 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path] ## Evaluate {#evaluate new="2"} - - -Evaluate a model's accuracy and speed on JSON-formatted annotated data. Will -print the results and optionally export -[displaCy visualizations](/usage/visualizers) of a sample set of parses to -`.html` files. Visualizations for the dependency parse and NER will be exported -as separate files if the respective component is present in the model's -pipeline. +Evaluate a model. Expects a loadable spaCy model and evaluation data in the +[binary `.spacy` format](/api/data-formats#binary-training). The +`--gold-preproc` option sets up the evaluation examples with gold-standard +sentences and tokens for the predictions. Gold preprocessing helps the +annotations align to the tokenization, and may result in sequences of more +consistent length. However, it may reduce runtime accuracy due to train/test +skew. To render a sample of dependency parses in an HTML file using the +[displaCy visualizations](/usage/visualizers), set an output directory as the +`--displacy-path` argument. 
```bash -$ python -m spacy evaluate [model] [data_path] [--output] [--displacy-path] -[--displacy-limit] [--gpu-id] [--gold-preproc] +$ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] +[--gpu-id] [--displacy-path] [--displacy-limit] ``` | Argument | Type | Description | @@ -660,10 +662,10 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--displacy-path] | `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. | | `data_path` | positional | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). | | `--output`, `-o` | option | Output JSON file for metrics. If not set, no metrics will be exported. | +| `--gold-preproc`, `-G` | flag | Use gold preprocessing. | +| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. | | `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. | | `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. | -| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. | -| `--gold-preproc`, `-G` | flag | Use gold preprocessing. | | **CREATES** | `stdout`, JSON, HTML | Training results and optional metrics and visualizations. 
| ## Package {#package} From 955d7b1b6b14b87966be4d4da41e31d05942dc9e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 7 Aug 2020 14:41:35 +0200 Subject: [PATCH 02/10] Update to latest Thinc --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 935b221d8..d4aa25943 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a22,<8.0.0a30", + "thinc>=8.0.0a23,<8.0.0a30", "blis>=0.4.0,<0.5.0", "pytokenizations", "smart_open>=2.0.0,<3.0.0" diff --git a/requirements.txt b/requirements.txt index a082f4b6e..4bb62742d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a22,<8.0.0a30 +thinc>=8.0.0a23,<8.0.0a30 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 249dc9827..f9da1adb9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,13 +34,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a22,<8.0.0a30 + thinc>=8.0.0a23,<8.0.0a30 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a22,<8.0.0a30 + thinc>=8.0.0a23,<8.0.0a30 blis>=0.4.0,<0.5.0 wasabi>=0.7.1,<1.1.0 srsly>=2.1.0,<3.0.0 From a8404c35175105f49f82ce37891d12ebf12ece91 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 7 Aug 2020 14:43:47 +0200 Subject: [PATCH 03/10] validation -> validate --- spacy/pipeline/entityruler.py | 6 +++--- website/docs/api/attributeruler.md | 4 ++-- website/docs/api/entityruler.md | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 8f280547e..bef97ec46 100644 --- a/spacy/pipeline/entityruler.py +++ 
b/spacy/pipeline/entityruler.py @@ -20,7 +20,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] assigns=["doc.ents", "token.ent_type", "token.ent_iob"], default_config={ "phrase_matcher_attr": None, - "validation": False, + "validate": False, "overwrite_ents": False, "ent_id_sep": DEFAULT_ENT_ID_SEP, }, @@ -31,7 +31,7 @@ def make_entity_ruler( nlp: Language, name: str, phrase_matcher_attr: Optional[Union[int, str]], - validation: bool, + validate: bool, overwrite_ents: bool, ent_id_sep: str, ): @@ -39,7 +39,7 @@ def make_entity_ruler( nlp, name, phrase_matcher_attr=phrase_matcher_attr, - validate=validation, + validate=validate, overwrite_ents=overwrite_ents, ent_id_sep=ent_id_sep, ) diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md index 31c9ed04f..7c4655bc5 100644 --- a/website/docs/api/attributeruler.md +++ b/website/docs/api/attributeruler.md @@ -25,8 +25,8 @@ how the component should be configured. You can override its settings via the > > ```python > config = { -> "validation": True, > "pattern_dicts": None, +> "validate": True, > } > nlp.add_pipe("attribute_ruler", config=config) > ``` @@ -34,7 +34,7 @@ how the component should be configured. You can override its settings via the | Setting | Type | Description | Default | | --------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------- | | `pattern_dicts` | `Iterable[dict]` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](#add) (`patterns`/`attrs`/`index`) to add as patterns. | `None` | -| `validation` | bool | Whether patterns should be validated, passed to `Matcher` as `validate`. | `False` | +| `validate` | bool | Whether patterns should be validated (passed to the `Matcher`). 
| `False` | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 0b4369ec4..1b98a659d 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -27,7 +27,7 @@ how the component should be configured. You can override its settings via the > ```python > config = { > "phrase_matcher_attr": None, -> "validation": True, +> "validate": True, > "overwrite_ents": False, > "ent_id_sep": "||", > } @@ -37,7 +37,7 @@ how the component should be configured. You can override its settings via the | Setting | Type | Description | Default | | --------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | `phrase_matcher_attr` | str | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. | `None` | -| `validation` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. | `False` | +| `validate` | bool | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). | `False` | | `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. | `False` | | `ent_id_sep` | str | Separator used internally for entity IDs. 
| `"||"` | From fc9a4fe8278d11f8078be991fa11c090910bb473 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 7 Aug 2020 14:43:55 +0200 Subject: [PATCH 04/10] Update attribute ruler --- spacy/pipeline/attributeruler.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 42a505025..aba76664c 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -17,13 +17,18 @@ MatcherPatternType = List[Dict[Union[int, str], Any]] AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]] -@Language.factory("attribute_ruler") +@Language.factory( + "attribute_ruler", default_config={"pattern_dicts": None, "validate": False} +) def make_attribute_ruler( nlp: Language, name: str, - pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None, + pattern_dicts: Optional[Iterable[AttributeRulerPatternType]], + validate: bool, ): - return AttributeRuler(nlp.vocab, name, pattern_dicts=pattern_dicts) + return AttributeRuler( + nlp.vocab, name, pattern_dicts=pattern_dicts, validate=validate + ) class AttributeRuler(Pipe): @@ -39,6 +44,7 @@ class AttributeRuler(Pipe): name: str = "attribute_ruler", *, pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None, + validate: bool = False, ) -> None: """Initialize the AttributeRuler. 
@@ -54,7 +60,7 @@ class AttributeRuler(Pipe): """ self.name = name self.vocab = vocab - self.matcher = Matcher(self.vocab) + self.matcher = Matcher(self.vocab, validate=validate) self.attrs = [] self._attrs_unnormed = [] # store for reference self.indices = [] From 4aecccf153ec6507d4e9c5364e5159de3d1f6d28 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 7 Aug 2020 15:17:25 +0200 Subject: [PATCH 05/10] Update API docs for AttributeRuler.__init__ --- website/docs/api/attributeruler.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md index 7c4655bc5..e2f009cad 100644 --- a/website/docs/api/attributeruler.md +++ b/website/docs/api/attributeruler.md @@ -65,8 +65,8 @@ pattern_dicts = \[ | `vocab` | `Vocab` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. | | `name` | str | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. | | _keyword-only_ | | | -| `pattern_dicts` | `Iterable[Dict]]` | Optional patterns to load in on initialization. | -| `validate` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. | +| `pattern_dicts` | `Iterable[Dict]` | Optional patterns to load in on initialization. Defaults to `None`. | +| `validate` | bool | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. | ## AttributeRuler.\_\_call\_\_ {#call tag="method"} From e962784531478841f902494f0f336104ba8a9a18 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 7 Aug 2020 15:27:13 +0200 Subject: [PATCH 06/10] Add Lemmatizer and simplify related components (#5848) * Add Lemmatizer and simplify related components * Add `Lemmatizer` pipe with `lookup` and `rule` modes using the `Lookups` tables. 
* Reduce `Tagger` to a simple tagger that sets `Token.tag` (no pos or lemma) * Reduce `Morphology` to only keep track of morph tags (no tag map, lemmatizer, or morph rules) * Remove lemmatizer from `Vocab` * Adjust many many tests Differences: * No default lookup lemmas * No special treatment of TAG in `from_array` and similar required * Easier to modify labels in a `Tagger` * No extra strings added from morphology / tag map * Fix test * Initial fix for Lemmatizer config/serialization * Adjust init test to be more generic * Adjust init test to force empty Lookups * Add simple cache to rule-based lemmatizer * Convert language-specific lemmatizers Convert language-specific lemmatizers to component lemmatizers. Remove previous lemmatizer class. * Fix French and Polish lemmatizers * Remove outdated UPOS conversions * Update Russian lemmatizer init in tests * Add minimal init/run tests for custom lemmatizers * Add option to overwrite existing lemmas * Update mode setting, lookup loading, and caching * Make `mode` an immutable property * Only enforce strict `load_lookups` for known supported modes * Move caching into individual `_lemmatize` methods * Implement strict when lang is not found in lookups * Fix tables/lookups in make_lemmatizer * Reallow provided lookups and allow for stricter checks * Add lookups asset to all Lemmatizer pipe tests * Rename lookups in lemmatizer init test * Clean up merge * Refactor lookup table loading * Add helper from `load_lemmatizer_lookups` that loads required and optional lookups tables based on settings provided by a config. Additional slight refactor of lookups: * Add `Lookups.set_table` to set a table from a provided `Table` * Reorder class definitions to be able to specify type as `Table` * Move registry assets into test methods * Refactor lookups tables config Use class methods within `Lemmatizer` to provide the config for particular modes and to load the lookups from a config. 
* Add pipe and score to lemmatizer * Simplify Tagger.score * Add missing import * Clean up imports and auto-format * Remove unused kwarg * Tidy up and auto-format * Update docstrings for Lemmatizer Update docstrings for Lemmatizer. Additionally modify `is_base_form` API to take `Token` instead of individual features. * Update docstrings * Remove tag map values from Tagger.add_label * Update API docs * Fix relative link in Lemmatizer API docs --- spacy/default_config.cfg | 3 - spacy/errors.py | 7 +- spacy/lang/el/__init__.py | 47 ++- spacy/lang/el/lemmatizer.py | 37 +- spacy/lang/en/__init__.py | 49 ++- spacy/lang/en/lemmatizer.py | 69 ++-- spacy/lang/fr/__init__.py | 48 ++- spacy/lang/fr/lemmatizer.py | 141 +++----- spacy/lang/ja/__init__.py | 2 - spacy/lang/ko/__init__.py | 4 +- spacy/lang/nl/__init__.py | 46 ++- spacy/lang/nl/lemmatizer.py | 183 +++++----- spacy/lang/pl/__init__.py | 51 ++- spacy/lang/pl/lemmatizer.py | 48 +-- spacy/lang/ru/__init__.py | 41 +-- spacy/lang/ru/lemmatizer.py | 52 ++- spacy/lang/uk/__init__.py | 43 +-- spacy/lang/uk/lemmatizer.py | 195 +---------- spacy/language.py | 26 -- spacy/lemmatizer.py | 145 -------- spacy/lookups.py | 304 ++++++++-------- spacy/morphology.pxd | 6 - spacy/morphology.pyx | 151 +------- spacy/pipeline/__init__.py | 4 +- spacy/pipeline/lemmatizer.py | 330 ++++++++++++++++++ spacy/pipeline/tagger.pyx | 126 +------ spacy/schemas.py | 1 - spacy/tests/conftest.py | 2 +- spacy/tests/doc/test_creation.py | 21 +- spacy/tests/doc/test_morphanalysis.py | 20 +- spacy/tests/doc/test_retokenize_merge.py | 3 - spacy/tests/lang/en/test_tagger.py | 21 -- spacy/tests/lang/ru/test_lemmatizer.py | 69 ++-- spacy/tests/lang/test_lemmatizers.py | 34 ++ spacy/tests/morphology/test_morph_features.py | 5 +- spacy/tests/morphology/test_morph_pickle.py | 15 +- spacy/tests/parser/test_parse.py | 4 +- spacy/tests/pipeline/test_lemmatizer.py | 109 ++++++ spacy/tests/pipeline/test_tagger.py | 16 +- spacy/tests/regression/test_issue1-1000.py | 
8 +- spacy/tests/regression/test_issue1001-1500.py | 16 +- spacy/tests/regression/test_issue1501-2000.py | 2 - spacy/tests/regression/test_issue2501-3000.py | 4 +- spacy/tests/regression/test_issue3001-3500.py | 4 +- spacy/tests/regression/test_issue3501-4000.py | 4 +- spacy/tests/regression/test_issue4001-4500.py | 1 + spacy/tests/regression/test_issue5230.py | 3 +- .../serialize/test_serialize_pipeline.py | 8 +- .../serialize/test_serialize_vocab_strings.py | 21 +- spacy/tests/test_lemmatizer.py | 64 ---- spacy/tests/tokenizer/test_tokenizer.py | 7 +- spacy/tokens/_retokenize.pyx | 6 +- spacy/tokens/doc.pyx | 18 +- spacy/tokens/token.pyx | 13 +- spacy/vocab.pyx | 22 +- website/docs/api/lemmatizer.md | 279 +++++++++++---- website/docs/api/morphology.md | 59 +--- website/docs/api/tagger.md | 29 +- website/docs/api/vocab.md | 2 - 59 files changed, 1439 insertions(+), 1609 deletions(-) delete mode 100644 spacy/lemmatizer.py create mode 100644 spacy/pipeline/lemmatizer.py delete mode 100644 spacy/tests/lang/en/test_tagger.py create mode 100644 spacy/tests/lang/test_lemmatizers.py create mode 100644 spacy/tests/pipeline/test_lemmatizer.py delete mode 100644 spacy/tests/test_lemmatizer.py diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 353924280..8aadad668 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -19,9 +19,6 @@ after_pipeline_creation = null [nlp.tokenizer] @tokenizers = "spacy.Tokenizer.v1" -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - [components] # Training hyper-parameters and additional features. diff --git a/spacy/errors.py b/spacy/errors.py index 7f47dd332..8e9a8d4b4 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -510,7 +510,7 @@ class Errors: E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. 
{id2}") E954 = ("The Tok2Vec listener did not receive a valid input.") - E955 = ("Can't find table '{table}' for language '{lang}' in spacy-lookups-data.") + E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.") E956 = ("Can't find component '{name}' in [components] block in the config. " "Available components: {opts}") E957 = ("Writing directly to Language.factories isn't needed anymore in " @@ -633,6 +633,11 @@ class Errors: E1001 = ("Target token outside of matched span for match with tokens " "'{span}' and offset '{index}' matched by patterns '{patterns}'.") E1002 = ("Span index out of range.") + E1003 = ("Unsupported lemmatizer mode '{mode}'.") + E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. " + "Required tables '{tables}', found '{found}'. If you are not " + "providing custom lookups, make sure you have the package " + "spacy-lookups-data installed.") @add_codes diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index c766c375e..0c5e0672b 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -1,38 +1,17 @@ -from typing import Callable -from thinc.api import Config +from typing import Optional +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from .lemmatizer import GreekLemmatizer from .syntax_iterators import SYNTAX_ITERATORS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES -from ...lookups import load_lookups +from .lemmatizer import GreekLemmatizer +from ...lookups import Lookups from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] - -[nlp.lemmatizer] -@lemmatizers = "spacy.el.GreekLemmatizer" -""" - - -@registry.lemmatizers("spacy.el.GreekLemmatizer") -def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]: - tables = ["lemma_index", "lemma_exc", "lemma_rules"] - - def 
lemmatizer_factory(nlp: Language) -> GreekLemmatizer: - lookups = load_lookups(lang=nlp.lang, tables=tables) - return GreekLemmatizer(lookups=lookups) - - return lemmatizer_factory class GreekDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES @@ -47,4 +26,22 @@ class Greek(Language): Defaults = GreekDefaults +@Greek.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups) + return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Greek"] diff --git a/spacy/lang/el/lemmatizer.py b/spacy/lang/el/lemmatizer.py index 809a23485..a049601dc 100644 --- a/spacy/lang/el/lemmatizer.py +++ b/spacy/lang/el/lemmatizer.py @@ -1,6 +1,7 @@ -from typing import Dict, List +from typing import List -from ...lemmatizer import Lemmatizer +from ...pipeline import Lemmatizer +from ...tokens import Token class GreekLemmatizer(Lemmatizer): @@ -14,13 +15,27 @@ class GreekLemmatizer(Lemmatizer): not applicable for Greek language. """ - def lemmatize( - self, - string: str, - index: Dict[str, List[str]], - exceptions: Dict[str, Dict[str, List[str]]], - rules: Dict[str, List[List[str]]], - ) -> List[str]: + def rule_lemmatize(self, token: Token) -> List[str]: + """Lemmatize using a rule-based approach. + + token (Token): The token to lemmatize. + RETURNS (list): The available lemmas for the string. 
+ """ + cache_key = (token.lower, token.pos) + if cache_key in self.cache: + return self.cache[cache_key] + string = token.text + univ_pos = token.pos_.lower() + if univ_pos in ("", "eol", "space"): + return [string.lower()] + + index_table = self.lookups.get_table("lemma_index", {}) + exc_table = self.lookups.get_table("lemma_exc", {}) + rules_table = self.lookups.get_table("lemma_rules", {}) + index = index_table.get(univ_pos, {}) + exceptions = exc_table.get(univ_pos, {}) + rules = rules_table.get(univ_pos, {}) + string = string.lower() forms = [] if string in index: @@ -42,4 +57,6 @@ class GreekLemmatizer(Lemmatizer): forms.extend(oov_forms) if not forms: forms.append(string) - return list(set(forms)) + forms = list(set(forms)) + self.cache[cache_key] = forms + return forms diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 81200da27..1a595b6e7 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -1,39 +1,18 @@ -from typing import Callable -from thinc.api import Config +from typing import Optional + +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS -from .lemmatizer import is_base_form from .punctuation import TOKENIZER_INFIXES +from .lemmatizer import EnglishLemmatizer from ...language import Language -from ...lemmatizer import Lemmatizer -from ...lookups import load_lookups -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] - -[nlp.lemmatizer] -@lemmatizers = "spacy.en.EnglishLemmatizer" -""" - - -@registry.lemmatizers("spacy.en.EnglishLemmatizer") -def create_lemmatizer() -> Callable[[Language], Lemmatizer]: - tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] - - def lemmatizer_factory(nlp: Language) -> Lemmatizer: - lookups = load_lookups(lang=nlp.lang, tables=tables) - return Lemmatizer(lookups=lookups, is_base_form=is_base_form) - - 
return lemmatizer_factory +from ...lookups import Lookups class EnglishDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES lex_attr_getters = LEX_ATTRS @@ -46,4 +25,22 @@ class English(Language): Defaults = EnglishDefaults +@English.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups) + return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["English"] diff --git a/spacy/lang/en/lemmatizer.py b/spacy/lang/en/lemmatizer.py index 6d5db9e1e..b8bef39b9 100644 --- a/spacy/lang/en/lemmatizer.py +++ b/spacy/lang/en/lemmatizer.py @@ -1,36 +1,43 @@ from typing import Optional +from ...pipeline import Lemmatizer +from ...tokens import Token -def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool: - """ - Check whether we're dealing with an uninflected paradigm, so we can - avoid lemmatization entirely. - univ_pos (unicode / int): The token's universal part-of-speech tag. - morphology (dict): The token's morphological features following the - Universal Dependencies scheme. +class EnglishLemmatizer(Lemmatizer): + """English lemmatizer. Only overrides is_base_form. 
""" - if morphology is None: - morphology = {} - if univ_pos == "noun" and morphology.get("Number") == "sing": - return True - elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": - return True - # This maps 'VBP' to base form -- probably just need 'IS_BASE' - # morphology - elif univ_pos == "verb" and ( - morphology.get("VerbForm") == "fin" - and morphology.get("Tense") == "pres" - and morphology.get("Number") is None - ): - return True - elif univ_pos == "adj" and morphology.get("Degree") == "pos": - return True - elif morphology.get("VerbForm") == "inf": - return True - elif morphology.get("VerbForm") == "none": - return True - elif morphology.get("Degree") == "pos": - return True - else: - return False + + def is_base_form(self, token: Token) -> bool: + """ + Check whether we're dealing with an uninflected paradigm, so we can + avoid lemmatization entirely. + + univ_pos (unicode / int): The token's universal part-of-speech tag. + morphology (dict): The token's morphological features following the + Universal Dependencies scheme. 
+ """ + univ_pos = token.pos_.lower() + morphology = token.morph.to_dict() + if univ_pos == "noun" and morphology.get("Number") == "Sing": + return True + elif univ_pos == "verb" and morphology.get("VerbForm") == "Inf": + return True + # This maps 'VBP' to base form -- probably just need 'IS_BASE' + # morphology + elif univ_pos == "verb" and ( + morphology.get("VerbForm") == "Fin" + and morphology.get("Tense") == "Pres" + and morphology.get("Number") is None + ): + return True + elif univ_pos == "adj" and morphology.get("Degree") == "Pos": + return True + elif morphology.get("VerbForm") == "Inf": + return True + elif morphology.get("VerbForm") == "None": + return True + elif morphology.get("Degree") == "Pos": + return True + else: + return False diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index a5350d422..42241cd8a 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -1,5 +1,6 @@ -from typing import Callable -from thinc.api import Config +from typing import Optional + +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES @@ -7,33 +8,12 @@ from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS -from .lemmatizer import FrenchLemmatizer, is_base_form -from ...lookups import load_lookups +from .lemmatizer import FrenchLemmatizer +from ...lookups import Lookups from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] - -[nlp.lemmatizer] -@lemmatizers = "spacy.fr.FrenchLemmatizer" -""" - - -@registry.lemmatizers("spacy.fr.FrenchLemmatizer") -def create_lemmatizer() -> Callable[[Language], FrenchLemmatizer]: - tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] - - def lemmatizer_factory(nlp: Language) -> FrenchLemmatizer: - lookups = load_lookups(lang=nlp.lang, 
tables=tables) - return FrenchLemmatizer(lookups=lookups, is_base_form=is_base_form) - - return lemmatizer_factory class FrenchDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES @@ -49,4 +29,22 @@ class French(Language): Defaults = FrenchDefaults +@French.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups) + return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["French"] diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py index e46ec1682..0dd782cc4 100644 --- a/spacy/lang/fr/lemmatizer.py +++ b/spacy/lang/fr/lemmatizer.py @@ -1,8 +1,7 @@ -from typing import Optional, List, Dict +from typing import List, Dict -from ...lemmatizer import Lemmatizer -from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP -from ...symbols import SCONJ, CCONJ +from ...pipeline import Lemmatizer +from ...tokens import Token class FrenchLemmatizer(Lemmatizer): @@ -15,65 +14,55 @@ class FrenchLemmatizer(Lemmatizer): the lookup table. 
""" - def __call__( - self, string: str, univ_pos: str, morphology: Optional[dict] = None - ) -> List[str]: - lookup_table = self.lookups.get_table("lemma_lookup", {}) - if "lemma_rules" not in self.lookups: - return [lookup_table.get(string, string)] - if univ_pos in (NOUN, "NOUN", "noun"): - univ_pos = "noun" - elif univ_pos in (VERB, "VERB", "verb"): - univ_pos = "verb" - elif univ_pos in (ADJ, "ADJ", "adj"): - univ_pos = "adj" - elif univ_pos in (ADP, "ADP", "adp"): - univ_pos = "adp" - elif univ_pos in (ADV, "ADV", "adv"): - univ_pos = "adv" - elif univ_pos in (AUX, "AUX", "aux"): - univ_pos = "aux" - elif univ_pos in (CCONJ, "CCONJ", "cconj"): - univ_pos = "cconj" - elif univ_pos in (DET, "DET", "det"): - univ_pos = "det" - elif univ_pos in (PRON, "PRON", "pron"): - univ_pos = "pron" - elif univ_pos in (PUNCT, "PUNCT", "punct"): - univ_pos = "punct" - elif univ_pos in (SCONJ, "SCONJ", "sconj"): - univ_pos = "sconj" + @classmethod + def get_lookups_config(cls, mode: str) -> Dict: + if mode == "rule": + return { + "required_tables": [ + "lemma_lookup", + "lemma_rules", + "lemma_exc", + "lemma_index", + ], + "optional_tables": [], + } else: - return [self.lookup(string)] + return super().get_lookups_config(mode) + + def rule_lemmatize(self, token: Token) -> List[str]: + cache_key = (token.orth, token.pos) + if cache_key in self.cache: + return self.cache[cache_key] + string = token.text + univ_pos = token.pos_.lower() + if univ_pos in ("", "eol", "space"): + return [string.lower()] + elif "lemma_rules" not in self.lookups or univ_pos not in ( + "noun", + "verb", + "adj", + "adp", + "adv", + "aux", + "cconj", + "det", + "pron", + "punct", + "sconj", + ): + return self.lookup_lemmatize(token) index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) rules_table = self.lookups.get_table("lemma_rules", {}) - lemmas = self.lemmatize( - string, - index_table.get(univ_pos, {}), - exc_table.get(univ_pos, {}), - 
rules_table.get(univ_pos, []), - ) - return lemmas - - def lookup(self, string: str, orth: Optional[int] = None) -> str: - lookup_table = self.lookups.get_table("lemma_lookup", {}) - if orth is not None and orth in lookup_table: - return lookup_table[orth][0] - return string - - def lemmatize( - self, - string: str, - index: Dict[str, List[str]], - exceptions: Dict[str, Dict[str, List[str]]], - rules: Dict[str, List[List[str]]], - ) -> List[str]: lookup_table = self.lookups.get_table("lemma_lookup", {}) + index = index_table.get(univ_pos, {}) + exceptions = exc_table.get(univ_pos, {}) + rules = rules_table.get(univ_pos, []) string = string.lower() forms = [] if string in index: forms.append(string) + self.cache[cache_key] = forms return forms forms.extend(exceptions.get(string, [])) oov_forms = [] @@ -90,45 +79,9 @@ class FrenchLemmatizer(Lemmatizer): if not forms: forms.extend(oov_forms) if not forms and string in lookup_table.keys(): - forms.append(lookup_table[string][0]) + forms.append(self.lookup_lemmatize(token)[0]) if not forms: forms.append(string) - return list(set(forms)) - - -def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool: - """ - Check whether we're dealing with an uninflected paradigm, so we can - avoid lemmatization entirely. 
- """ - morphology = {} if morphology is None else morphology - others = [ - key - for key in morphology - if key not in (POS, "Number", "POS", "VerbForm", "Tense") - ] - if univ_pos == "noun" and morphology.get("Number") == "sing": - return True - elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": - return True - # This maps 'VBP' to base form -- probably just need 'IS_BASE' - # morphology - elif univ_pos == "verb" and ( - morphology.get("VerbForm") == "fin" - and morphology.get("Tense") == "pres" - and morphology.get("Number") is None - and not others - ): - return True - elif univ_pos == "adj" and morphology.get("Degree") == "pos": - return True - elif "VerbForm=inf" in morphology: - return True - elif "VerbForm=none" in morphology: - return True - elif "Number=sing" in morphology: - return True - elif "Degree=pos" in morphology: - return True - else: - return False + forms = list(set(forms)) + self.cache[cache_key] = forms + return forms diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 900db4e4c..051415455 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -38,8 +38,6 @@ def create_tokenizer(split_mode: Optional[str] = None): class JapaneseTokenizer(DummyTokenizer): def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None: self.vocab = nlp.vocab - # TODO: is this the right way to do it? 
- self.vocab.morphology.load_tag_map(TAG_MAP) self.split_mode = split_mode self.tokenizer = try_sudachi_import(self.split_mode) diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index f2954f461..47a3887a6 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -7,6 +7,7 @@ from .lex_attrs import LEX_ATTRS from ...language import Language from ...tokens import Doc from ...compat import copy_reg +from ...symbols import POS from ...util import DummyTokenizer, registry @@ -29,8 +30,6 @@ def create_tokenizer(): class KoreanTokenizer(DummyTokenizer): def __init__(self, nlp: Optional[Language] = None): self.vocab = nlp.vocab - # TODO: is this the right way to do it? - self.vocab.morphology.load_tag_map(TAG_MAP) MeCab = try_mecab_import() self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") @@ -44,6 +43,7 @@ class KoreanTokenizer(DummyTokenizer): for token, dtoken in zip(doc, dtokens): first_tag, sep, eomi_tags = dtoken["tag"].partition("+") token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미) + token.pos = TAG_MAP[token.tag_][POS] token.lemma_ = dtoken["lemma"] doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens] return doc diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index d874ef7a1..1526e41f5 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -1,5 +1,6 @@ -from typing import Callable -from thinc.api import Config +from typing import Optional + +from thinc.api import Model from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -7,32 +8,11 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .lemmatizer import DutchLemmatizer -from ...lookups import load_lookups +from ...lookups import Lookups from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] - -[nlp.lemmatizer] -@lemmatizers = "spacy.nl.DutchLemmatizer" -""" - - 
-@registry.lemmatizers("spacy.nl.DutchLemmatizer") -def create_lemmatizer() -> Callable[[Language], DutchLemmatizer]: - tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] - - def lemmatizer_factory(nlp: Language) -> DutchLemmatizer: - lookups = load_lookups(lang=nlp.lang, tables=tables) - return DutchLemmatizer(lookups=lookups) - - return lemmatizer_factory class DutchDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES @@ -46,4 +26,22 @@ class Dutch(Language): Defaults = DutchDefaults +@Dutch.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups) + return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Dutch"] diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py index b01debaa9..42b97a862 100644 --- a/spacy/lang/nl/lemmatizer.py +++ b/spacy/lang/nl/lemmatizer.py @@ -1,44 +1,34 @@ -from typing import Optional, List, Dict, Tuple +from typing import List, Dict -from ...lemmatizer import Lemmatizer -from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV +from ...pipeline import Lemmatizer +from ...tokens import Token class DutchLemmatizer(Lemmatizer): - # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB. 
- univ_pos_name_variants = { - NOUN: "noun", - "NOUN": "noun", - "noun": "noun", - VERB: "verb", - "VERB": "verb", - "verb": "verb", - AUX: "verb", - "AUX": "verb", - "aux": "verb", - ADJ: "adj", - "ADJ": "adj", - "adj": "adj", - ADV: "adv", - "ADV": "adv", - "adv": "adv", - PRON: "pron", - "PRON": "pron", - "pron": "pron", - DET: "det", - "DET": "det", - "det": "det", - ADP: "adp", - "ADP": "adp", - "adp": "adp", - NUM: "num", - "NUM": "num", - "num": "num", - } + @classmethod + def get_lookups_config(cls, mode: str) -> Dict: + if mode == "rule": + return { + "required_tables": [ + "lemma_lookup", + "lemma_rules", + "lemma_exc", + "lemma_index", + ], + } + else: + return super().get_lookups_config(mode) - def __call__( - self, string: str, univ_pos: str, morphology: Optional[dict] = None - ) -> List[str]: + def lookup_lemmatize(self, token: Token) -> List[str]: + """Overrides parent method so that a lowercased version of the string + is used to search the lookup table. This is necessary because our + lookup table consists entirely of lowercase keys.""" + lookup_table = self.lookups.get_table("lemma_lookup", {}) + string = token.text.lower() + return [lookup_table.get(string, string)] + + # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB. + def rule_lemmatize(self, token: Token) -> List[str]: # Difference 1: self.rules is assumed to be non-None, so no # 'is None' check required. # String lowercased from the get-go. All lemmatization results in @@ -46,74 +36,61 @@ class DutchLemmatizer(Lemmatizer): # any problems, and it keeps the exceptions indexes small. If this # creates problems for proper nouns, we can introduce a check for # univ_pos == "PROPN". - string = string.lower() - try: - univ_pos = self.univ_pos_name_variants[univ_pos] - except KeyError: - # Because PROPN not in self.univ_pos_name_variants, proper names - # are not lemmatized. They are lowercased, however. 
- return [string] - # if string in self.lemma_index.get(univ_pos) + cache_key = (token.lower, token.pos) + if cache_key in self.cache: + return self.cache[cache_key] + string = token.text + univ_pos = token.pos_.lower() + if univ_pos in ("", "eol", "space"): + forms = [string.lower()] + self.cache[cache_key] = forms + return forms + index_table = self.lookups.get_table("lemma_index", {}) + exc_table = self.lookups.get_table("lemma_exc", {}) + rules_table = self.lookups.get_table("lemma_rules", {}) + index = index_table.get(univ_pos, {}) + exceptions = exc_table.get(univ_pos, {}) + rules = rules_table.get(univ_pos, {}) + + string = string.lower() + if univ_pos not in ( + "noun", + "verb", + "aux", + "adj", + "adv", + "pron", + "det", + "adp", + "num", + ): + forms = [string] + self.cache[cache_key] = forms + return forms lemma_index = index_table.get(univ_pos, {}) # string is already lemma if string in lemma_index: - return [string] + forms = [string] + self.cache[cache_key] = forms + return forms exc_table = self.lookups.get_table("lemma_exc", {}) exceptions = exc_table.get(univ_pos, {}) # string is irregular token contained in exceptions index. try: - lemma = exceptions[string] - return [lemma[0]] + forms = [exceptions[string][0]] + self.cache[cache_key] = forms + return forms except KeyError: pass # string corresponds to key in lookup table lookup_table = self.lookups.get_table("lemma_lookup", {}) looked_up_lemma = lookup_table.get(string) if looked_up_lemma and looked_up_lemma in lemma_index: - return [looked_up_lemma] + forms = [looked_up_lemma] + self.cache[cache_key] = forms + return forms rules_table = self.lookups.get_table("lemma_rules", {}) - forms, is_known = self.lemmatize( - string, lemma_index, exceptions, rules_table.get(univ_pos, []) - ) - # Back-off through remaining return value candidates. 
- if forms: - if is_known: - return forms - else: - for form in forms: - if form in exceptions: - return [form] - if looked_up_lemma: - return [looked_up_lemma] - else: - return forms - elif looked_up_lemma: - return [looked_up_lemma] - else: - return [string] - - # Overrides parent method so that a lowercased version of the string is - # used to search the lookup table. This is necessary because our lookup - # table consists entirely of lowercase keys. - def lookup(self, string: str, orth: Optional[int] = None) -> str: - lookup_table = self.lookups.get_table("lemma_lookup", {}) - string = string.lower() - if orth is not None: - return lookup_table.get(orth, string) - else: - return lookup_table.get(string, string) - - # Reimplemented to focus more on application of suffix rules and to return - # as early as possible. - def lemmatize( - self, - string: str, - index: Dict[str, List[str]], - exceptions: Dict[str, Dict[str, List[str]]], - rules: Dict[str, List[List[str]]], - ) -> Tuple[List[str], bool]: - # returns (forms, is_known: bool) oov_forms = [] for old, new in rules: if string.endswith(old): @@ -121,7 +98,31 @@ class DutchLemmatizer(Lemmatizer): if not form: pass elif form in index: - return [form], True # True = Is known (is lemma) + forms = [form] + self.cache[cache_key] = forms + return forms else: oov_forms.append(form) - return list(set(oov_forms)), False + forms = list(set(oov_forms)) + # Back-off through remaining return value candidates. 
+ if forms: + for form in forms: + if form in exceptions: + forms = [form] + self.cache[cache_key] = forms + return forms + if looked_up_lemma: + forms = [looked_up_lemma] + self.cache[cache_key] = forms + return forms + else: + self.cache[cache_key] = forms + return forms + elif looked_up_lemma: + forms = [looked_up_lemma] + self.cache[cache_key] = forms + return forms + else: + forms = [string] + self.cache[cache_key] = forms + return forms diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 2393f1aea..a180fa6e9 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -1,5 +1,6 @@ -from typing import Callable -from thinc.api import Config +from typing import Optional + +from thinc.api import Model from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES @@ -7,42 +8,16 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import PolishLemmatizer from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...lookups import load_lookups +from ...lookups import Lookups from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] - -[nlp.lemmatizer] -@lemmatizers = "spacy.pl.PolishLemmatizer" -""" - TOKENIZER_EXCEPTIONS = { exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") } -@registry.lemmatizers("spacy.pl.PolishLemmatizer") -def create_lemmatizer() -> Callable[[Language], PolishLemmatizer]: - # fmt: off - tables = [ - "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", - "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", - "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb" - ] - # fmt: on - - def lemmatizer_factory(nlp: Language) -> PolishLemmatizer: - lookups = load_lookups(lang=nlp.lang, tables=tables) - return PolishLemmatizer(lookups=lookups) - - return lemmatizer_factory - - class PolishDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) 
tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES @@ -56,4 +31,22 @@ class Polish(Language): Defaults = PolishDefaults +@Polish.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "lookup", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups) + return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Polish"] diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py index 8e96dd75b..c4c6db06a 100644 --- a/spacy/lang/pl/lemmatizer.py +++ b/spacy/lang/pl/lemmatizer.py @@ -1,7 +1,7 @@ -from typing import Optional, List, Dict +from typing import List, Dict -from ...lemmatizer import Lemmatizer -from ...parts_of_speech import NAMES +from ...pipeline import Lemmatizer +from ...tokens import Token class PolishLemmatizer(Lemmatizer): @@ -9,12 +9,30 @@ class PolishLemmatizer(Lemmatizer): # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS. # It utilizes some prefix based improvements for verb and adjectives # lemmatization, as well as case-sensitive lemmatization for nouns. 
- def __call__( - self, string: str, univ_pos: str, morphology: Optional[dict] = None - ) -> List[str]: - if isinstance(univ_pos, int): - univ_pos = NAMES.get(univ_pos, "X") - univ_pos = univ_pos.upper() + + @classmethod + def get_lookups_config(cls, mode: str) -> Dict: + if mode == "lookup": + return { + "required_tables": [ + "lemma_lookup_adj", + "lemma_lookup_adp", + "lemma_lookup_adv", + "lemma_lookup_aux", + "lemma_lookup_noun", + "lemma_lookup_num", + "lemma_lookup_part", + "lemma_lookup_pron", + "lemma_lookup_verb", + ] + } + else: + return super().get_lookups_config(mode) + + def lookup_lemmatize(self, token: Token) -> List[str]: + string = token.text + univ_pos = token.pos_ + morphology = token.morph.to_dict() lookup_pos = univ_pos.lower() if univ_pos == "PROPN": lookup_pos = "noun" @@ -71,15 +89,3 @@ class PolishLemmatizer(Lemmatizer): return [lookup_table[string]] return [string.lower()] return [lookup_table.get(string, string)] - - def lookup(self, string: str, orth: Optional[int] = None) -> str: - return string.lower() - - def lemmatize( - self, - string: str, - index: Dict[str, List[str]], - exceptions: Dict[str, Dict[str, List[str]]], - rules: Dict[str, List[List[str]]], - ) -> List[str]: - raise NotImplementedError diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index 5d2333edf..be770e3ec 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -1,32 +1,16 @@ -from typing import Callable -from thinc.api import Config +from typing import Optional + +from thinc.api import Model from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .lemmatizer import RussianLemmatizer -from ...util import registry from ...language import Language - - -DEFAULT_CONFIG = """ -[nlp] - -[nlp.lemmatizer] -@lemmatizers = "spacy.ru.RussianLemmatizer" -""" - - -@registry.lemmatizers("spacy.ru.RussianLemmatizer") -def create_lemmatizer() -> Callable[[Language], 
RussianLemmatizer]: - def lemmatizer_factory(nlp: Language) -> RussianLemmatizer: - return RussianLemmatizer() - - return lemmatizer_factory +from ...lookups import Lookups class RussianDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS @@ -37,4 +21,21 @@ class Russian(Language): Defaults = RussianDefaults +@Russian.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "pymorphy2", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Russian"] diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 28767348d..8d7996c63 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -1,8 +1,12 @@ -from typing import Optional, Tuple, Dict, List +from typing import Optional, List, Dict, Tuple + +from thinc.api import Model -from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS -from ...lemmatizer import Lemmatizer from ...lookups import Lookups +from ...pipeline import Lemmatizer +from ...symbols import POS +from ...tokens import Token +from ...vocab import Vocab PUNCT_RULES = {"«": '"', "»": '"'} @@ -11,8 +15,17 @@ PUNCT_RULES = {"«": '"', "»": '"'} class RussianLemmatizer(Lemmatizer): _morph = None - def __init__(self, lookups: Optional[Lookups] = None) -> None: - super(RussianLemmatizer, self).__init__(lookups) + def __init__( + self, + vocab: Vocab, + model: Optional[Model], + name: str = "lemmatizer", + *, + mode: str = "pymorphy2", + lookups: Optional[Lookups] = None, + ) -> None: + super().__init__(vocab, model, name, mode=mode, lookups=lookups) + try: from pymorphy2 import MorphAnalyzer except 
ImportError: @@ -25,10 +38,10 @@ class RussianLemmatizer(Lemmatizer): if RussianLemmatizer._morph is None: RussianLemmatizer._morph = MorphAnalyzer() - def __call__( - self, string: str, univ_pos: str, morphology: Optional[dict] = None - ) -> List[str]: - univ_pos = self.normalize_univ_pos(univ_pos) + def pymorphy2_lemmatize(self, token: Token) -> List[str]: + string = token.text + univ_pos = token.pos_ + morphology = token.morph.to_dict() if univ_pos == "PUNCT": return [PUNCT_RULES.get(string, string)] if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"): @@ -81,25 +94,8 @@ class RussianLemmatizer(Lemmatizer): return [string.lower()] return list(set([analysis.normal_form for analysis in filtered_analyses])) - @staticmethod - def normalize_univ_pos(univ_pos: str) -> Optional[str]: - if isinstance(univ_pos, str): - return univ_pos.upper() - symbols_to_str = { - ADJ: "ADJ", - DET: "DET", - NOUN: "NOUN", - NUM: "NUM", - PRON: "PRON", - PROPN: "PROPN", - PUNCT: "PUNCT", - VERB: "VERB", - } - if univ_pos in symbols_to_str: - return symbols_to_str[univ_pos] - return None - - def lookup(self, string: str, orth: Optional[int] = None) -> str: + def lookup_lemmatize(self, token: Token) -> List[str]: + string = token.text analyses = self._morph.parse(string) if len(analyses) == 1: return analyses[0].normal_form diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index 6b44a7144..e9936cf7d 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -1,32 +1,16 @@ -from typing import Callable -from thinc.api import Config +from typing import Optional + +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...util import registry -from ...language import Language from .lemmatizer import UkrainianLemmatizer - - -DEFAULT_CONFIG = """ -[nlp] - -[nlp.lemmatizer] -@lemmatizers = "spacy.uk.UkrainianLemmatizer" -""" - - 
-@registry.lemmatizers("spacy.uk.UkrainianLemmatizer") -def create_ukrainian_lemmatizer() -> Callable[[Language], UkrainianLemmatizer]: - def lemmatizer_factory(nlp: Language) -> UkrainianLemmatizer: - return UkrainianLemmatizer() - - return lemmatizer_factory +from ...language import Language +from ...lookups import Lookups class UkrainianDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS @@ -37,4 +21,21 @@ class Ukrainian(Language): Defaults = UkrainianDefaults +@Ukrainian.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "pymorphy2", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Ukrainian"] diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index cf89d1a12..0d6febce6 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -1,187 +1,30 @@ -from typing import Optional, List, Tuple, Dict +from typing import Optional -from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS +from thinc.api import Model + +from ..ru.lemmatizer import RussianLemmatizer from ...lookups import Lookups -from ...lemmatizer import Lemmatizer +from ...vocab import Vocab -PUNCT_RULES = {"«": '"', "»": '"'} - - -class UkrainianLemmatizer(Lemmatizer): - _morph = None - - def __init__(self, lookups: Optional[Lookups] = None) -> None: - super(UkrainianLemmatizer, self).__init__(lookups) +class UkrainianLemmatizer(RussianLemmatizer): + def __init__( + self, + vocab: Vocab, + model: Optional[Model], + name: str = "lemmatizer", + *, + mode: str = "pymorphy2", + lookups: Optional[Lookups] = None, + ) -> None: + 
super().__init__(vocab, model, name, mode=mode, lookups=lookups) try: from pymorphy2 import MorphAnalyzer - - if UkrainianLemmatizer._morph is None: - UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk") - except (ImportError, TypeError): + except ImportError: raise ImportError( "The Ukrainian lemmatizer requires the pymorphy2 library and " 'dictionaries: try to fix it with "pip uninstall pymorphy2" and' '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"' ) from None - - def __call__( - self, string: str, univ_pos: str, morphology: Optional[dict] = None - ) -> List[str]: - univ_pos = self.normalize_univ_pos(univ_pos) - if univ_pos == "PUNCT": - return [PUNCT_RULES.get(string, string)] - if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"): - # Skip unchangeable pos - return [string.lower()] - analyses = self._morph.parse(string) - filtered_analyses = [] - for analysis in analyses: - if not analysis.is_known: - # Skip suggested parse variant for unknown word for pymorphy - continue - analysis_pos, _ = oc2ud(str(analysis.tag)) - if analysis_pos == univ_pos or ( - analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN") - ): - filtered_analyses.append(analysis) - if not len(filtered_analyses): - return [string.lower()] - if morphology is None or (len(morphology) == 1 and POS in morphology): - return list(set([analysis.normal_form for analysis in filtered_analyses])) - if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"): - features_to_compare = ["Case", "Number", "Gender"] - elif univ_pos == "NUM": - features_to_compare = ["Case", "Gender"] - elif univ_pos == "PRON": - features_to_compare = ["Case", "Number", "Gender", "Person"] - else: # VERB - features_to_compare = [ - "Aspect", - "Gender", - "Mood", - "Number", - "Tense", - "VerbForm", - "Voice", - ] - analyses, filtered_analyses = filtered_analyses, [] - for analysis in analyses: - _, analysis_morph = oc2ud(str(analysis.tag)) - for feature in 
features_to_compare: - if ( - feature in morphology - and feature in analysis_morph - and morphology[feature].lower() != analysis_morph[feature].lower() - ): - break - else: - filtered_analyses.append(analysis) - if not len(filtered_analyses): - return [string.lower()] - return list(set([analysis.normal_form for analysis in filtered_analyses])) - - @staticmethod - def normalize_univ_pos(univ_pos: str) -> Optional[str]: - if isinstance(univ_pos, str): - return univ_pos.upper() - symbols_to_str = { - ADJ: "ADJ", - DET: "DET", - NOUN: "NOUN", - NUM: "NUM", - PRON: "PRON", - PROPN: "PROPN", - PUNCT: "PUNCT", - VERB: "VERB", - } - if univ_pos in symbols_to_str: - return symbols_to_str[univ_pos] - return None - - def lookup(self, string: str, orth: Optional[int] = None) -> str: - analyses = self._morph.parse(string) - if len(analyses) == 1: - return analyses[0].normal_form - return string - - -def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]: - gram_map = { - "_POS": { - "ADJF": "ADJ", - "ADJS": "ADJ", - "ADVB": "ADV", - "Apro": "DET", - "COMP": "ADJ", # Can also be an ADV - unchangeable - "CONJ": "CCONJ", # Can also be a SCONJ - both unchangeable ones - "GRND": "VERB", - "INFN": "VERB", - "INTJ": "INTJ", - "NOUN": "NOUN", - "NPRO": "PRON", - "NUMR": "NUM", - "NUMB": "NUM", - "PNCT": "PUNCT", - "PRCL": "PART", - "PREP": "ADP", - "PRTF": "VERB", - "PRTS": "VERB", - "VERB": "VERB", - }, - "Animacy": {"anim": "Anim", "inan": "Inan"}, - "Aspect": {"impf": "Imp", "perf": "Perf"}, - "Case": { - "ablt": "Ins", - "accs": "Acc", - "datv": "Dat", - "gen1": "Gen", - "gen2": "Gen", - "gent": "Gen", - "loc2": "Loc", - "loct": "Loc", - "nomn": "Nom", - "voct": "Voc", - }, - "Degree": {"COMP": "Cmp", "Supr": "Sup"}, - "Gender": {"femn": "Fem", "masc": "Masc", "neut": "Neut"}, - "Mood": {"impr": "Imp", "indc": "Ind"}, - "Number": {"plur": "Plur", "sing": "Sing"}, - "NumForm": {"NUMB": "Digit"}, - "Person": {"1per": "1", "2per": "2", "3per": "3", "excl": "2", "incl": "1"}, - "Tense": 
{"futr": "Fut", "past": "Past", "pres": "Pres"}, - "Variant": {"ADJS": "Brev", "PRTS": "Brev"}, - "VerbForm": { - "GRND": "Conv", - "INFN": "Inf", - "PRTF": "Part", - "PRTS": "Part", - "VERB": "Fin", - }, - "Voice": {"actv": "Act", "pssv": "Pass"}, - "Abbr": {"Abbr": "Yes"}, - } - pos = "X" - morphology = dict() - unmatched = set() - grams = oc_tag.replace(" ", ",").split(",") - for gram in grams: - match = False - for categ, gmap in sorted(gram_map.items()): - if gram in gmap: - match = True - if categ == "_POS": - pos = gmap[gram] - else: - morphology[categ] = gmap[gram] - if not match: - unmatched.add(gram) - while len(unmatched) > 0: - gram = unmatched.pop() - if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"): - pos = "PROPN" - elif gram == "Auxt": - pos = "AUX" - elif gram == "Pltm": - morphology["Number"] = "Ptan" - return pos, morphology + if UkrainianLemmatizer._morph is None: + UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk") diff --git a/spacy/language.py b/spacy/language.py index 9018af73c..96661915a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -29,7 +29,6 @@ from .lang.punctuation import TOKENIZER_INFIXES from .tokens import Doc from .lookups import load_lookups from .tokenizer import Tokenizer -from .lemmatizer import Lemmatizer from .errors import Errors, Warnings from .schemas import ConfigSchema from .git_info import GIT_VERSION @@ -87,22 +86,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: return tokenizer_factory -@registry.lemmatizers("spacy.Lemmatizer.v1") -def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]: - """Registered function to create a lemmatizer. Returns a factory that takes - the nlp object and returns a Lemmatizer instance with data loaded in from - spacy-lookups-data, if the package is installed. 
- """ - # TODO: Will be replaced when the lemmatizer becomes a pipeline component - tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] - - def lemmatizer_factory(nlp: "Language") -> "Lemmatizer": - lookups = load_lookups(lang=nlp.lang, tables=tables, strict=False) - return Lemmatizer(lookups=lookups) - - return lemmatizer_factory - - class Language: """A text-processing pipeline. Usually you'll load this once per process, and pass the instance around your application. @@ -128,7 +111,6 @@ class Language: max_length: int = 10 ** 6, meta: Dict[str, Any] = {}, create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, - create_lemmatizer: Optional[Callable[["Language"], Callable]] = None, **kwargs, ) -> None: """Initialise a Language object. @@ -146,8 +128,6 @@ class Language: 100,000 characters in one text. create_tokenizer (Callable): Function that takes the nlp object and returns a tokenizer. - create_lemmatizer (Callable): Function that takes the nlp object and - returns a lemmatizer. 
DOCS: https://spacy.io/api/language#init """ @@ -166,13 +146,9 @@ class Language: if vocab is True: vectors_name = meta.get("vectors", {}).get("name") - if not create_lemmatizer: - lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]} - create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"] vocab = create_vocab( self.lang, self.Defaults, - lemmatizer=create_lemmatizer(self), vectors_name=vectors_name, load_data=self._config["nlp"]["load_vocab_data"], ) @@ -1451,7 +1427,6 @@ class Language: filled["components"] = orig_pipeline config["components"] = orig_pipeline create_tokenizer = resolved["nlp"]["tokenizer"] - create_lemmatizer = resolved["nlp"]["lemmatizer"] before_creation = resolved["nlp"]["before_creation"] after_creation = resolved["nlp"]["after_creation"] after_pipeline_creation = resolved["nlp"]["after_pipeline_creation"] @@ -1467,7 +1442,6 @@ class Language: nlp = lang_cls( vocab=vocab, create_tokenizer=create_tokenizer, - create_lemmatizer=create_lemmatizer, ) if after_creation is not None: nlp = after_creation(nlp) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py deleted file mode 100644 index adba79686..000000000 --- a/spacy/lemmatizer.py +++ /dev/null @@ -1,145 +0,0 @@ -from typing import Optional, Callable, List, Dict - -from .lookups import Lookups -from .parts_of_speech import NAMES as UPOS_NAMES - - -class Lemmatizer: - """ - The Lemmatizer supports simple part-of-speech-sensitive suffix rules and - lookup tables. - - DOCS: https://spacy.io/api/lemmatizer - """ - - def __init__( - self, - lookups: Optional[Lookups] = None, - is_base_form: Optional[Callable] = None, - ) -> None: - """Initialize a Lemmatizer. - - lookups (Lookups): The lookups object containing the (optional) tables - "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". 
- """ - self.lookups = lookups if lookups is not None else Lookups() - self.is_base_form = is_base_form - - def __call__( - self, string: str, univ_pos: str, morphology: Optional[dict] = None - ) -> List[str]: - """Lemmatize a string. - - string (str): The string to lemmatize, e.g. the token text. - univ_pos (str / int): The token's universal part-of-speech tag. - morphology (dict): The token's morphological features following the - Universal Dependencies scheme. - RETURNS (list): The available lemmas for the string. - """ - lookup_table = self.lookups.get_table("lemma_lookup", {}) - if "lemma_rules" not in self.lookups: - return [lookup_table.get(string, string)] - if isinstance(univ_pos, int): - univ_pos = UPOS_NAMES.get(univ_pos, "X") - univ_pos = univ_pos.lower() - if univ_pos in ("", "eol", "space"): - return [string.lower()] - # See Issue #435 for example of where this logic is requied. - if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology): - return [string.lower()] - index_table = self.lookups.get_table("lemma_index", {}) - exc_table = self.lookups.get_table("lemma_exc", {}) - rules_table = self.lookups.get_table("lemma_rules", {}) - if not any( - ( - index_table.get(univ_pos), - exc_table.get(univ_pos), - rules_table.get(univ_pos), - ) - ): - if univ_pos == "propn": - return [string] - else: - return [string.lower()] - lemmas = self.lemmatize( - string, - index_table.get(univ_pos, {}), - exc_table.get(univ_pos, {}), - rules_table.get(univ_pos, []), - ) - return lemmas - - def noun(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "noun", morphology) - - def verb(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "verb", morphology) - - def adj(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "adj", morphology) - - def det(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "det", 
morphology) - - def pron(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "pron", morphology) - - def adp(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "adp", morphology) - - def num(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "num", morphology) - - def punct(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "punct", morphology) - - def lookup(self, string: str, orth: Optional[int] = None) -> str: - """Look up a lemma in the table, if available. If no lemma is found, - the original string is returned. - - string (str): The original string. - orth (int): Optional hash of the string to look up. If not set, the - string will be used and hashed. - RETURNS (str): The lemma if the string was found, otherwise the - original string. - """ - lookup_table = self.lookups.get_table("lemma_lookup", {}) - key = orth if orth is not None else string - if key in lookup_table: - return lookup_table[key] - return string - - def lemmatize( - self, - string: str, - index: Dict[str, List[str]], - exceptions: Dict[str, Dict[str, List[str]]], - rules: Dict[str, List[List[str]]], - ) -> List[str]: - orig = string - string = string.lower() - forms = [] - oov_forms = [] - for old, new in rules: - if string.endswith(old): - form = string[: len(string) - len(old)] + new - if not form: - pass - elif form in index or not form.isalpha(): - forms.append(form) - else: - oov_forms.append(form) - # Remove duplicates but preserve the ordering of applied "rules" - forms = list(dict.fromkeys(forms)) - # Put exceptions at the front of the list, so they get priority. - # This is a dodgy heuristic -- but it's the best we can do until we get - # frequencies on this. We can at least prune out problematic exceptions, - # if they shadow more frequent analyses. 
- for form in exceptions.get(string, []): - if form not in forms: - forms.insert(0, form) - if not forms: - forms.extend(oov_forms) - if not forms: - forms.append(orig) - return forms diff --git a/spacy/lookups.py b/spacy/lookups.py index 7862b9805..d79a5b950 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -28,6 +28,8 @@ def load_lookups( # TODO: import spacy_lookups_data instead of going via entry points here? lookups = Lookups() if lang not in registry.lookups: + if strict and len(tables) > 0: + raise ValueError(Errors.E955.format(table=", ".join(tables), lang=lang)) return lookups data = registry.lookups.get(lang) for table in tables: @@ -41,152 +43,6 @@ def load_lookups( return lookups -class Lookups: - """Container for large lookup tables and dictionaries, e.g. lemmatization - data or tokenizer exception lists. Lookups are available via vocab.lookups, - so they can be accessed before the pipeline components are applied (e.g. - in the tokenizer and lemmatizer), as well as within the pipeline components - via doc.vocab.lookups. - """ - - def __init__(self) -> None: - """Initialize the Lookups object. - - DOCS: https://spacy.io/api/lookups#init - """ - self._tables = {} - - def __contains__(self, name: str) -> bool: - """Check if the lookups contain a table of a given name. Delegates to - Lookups.has_table. - - name (str): Name of the table. - RETURNS (bool): Whether a table of that name is in the lookups. - """ - return self.has_table(name) - - def __len__(self) -> int: - """RETURNS (int): The number of tables in the lookups.""" - return len(self._tables) - - @property - def tables(self) -> List[str]: - """RETURNS (List[str]): Names of all tables in the lookups.""" - return list(self._tables.keys()) - - def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table": - """Add a new table to the lookups. Raises an error if the table exists. - - name (str): Unique name of table. - data (dict): Optional data to add to the table. 
- RETURNS (Table): The newly added table. - - DOCS: https://spacy.io/api/lookups#add_table - """ - if name in self.tables: - raise ValueError(Errors.E158.format(name=name)) - table = Table(name=name, data=data) - self._tables[name] = table - return table - - def get_table(self, name: str, default: Any = UNSET) -> "Table": - """Get a table. Raises an error if the table doesn't exist and no - default value is provided. - - name (str): Name of the table. - default (Any): Optional default value to return if table doesn't exist. - RETURNS (Table): The table. - - DOCS: https://spacy.io/api/lookups#get_table - """ - if name not in self._tables: - if default == UNSET: - raise KeyError(Errors.E159.format(name=name, tables=self.tables)) - return default - return self._tables[name] - - def remove_table(self, name: str) -> "Table": - """Remove a table. Raises an error if the table doesn't exist. - - name (str): Name of the table to remove. - RETURNS (Table): The removed table. - - DOCS: https://spacy.io/api/lookups#remove_table - """ - if name not in self._tables: - raise KeyError(Errors.E159.format(name=name, tables=self.tables)) - return self._tables.pop(name) - - def has_table(self, name: str) -> bool: - """Check if the lookups contain a table of a given name. - - name (str): Name of the table. - RETURNS (bool): Whether a table of that name exists. - - DOCS: https://spacy.io/api/lookups#has_table - """ - return name in self._tables - - def to_bytes(self, **kwargs) -> bytes: - """Serialize the lookups to a bytestring. - - RETURNS (bytes): The serialized Lookups. - - DOCS: https://spacy.io/api/lookups#to_bytes - """ - return srsly.msgpack_dumps(self._tables) - - def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups": - """Load the lookups from a bytestring. - - bytes_data (bytes): The data to load. - RETURNS (Lookups): The loaded Lookups. 
- - DOCS: https://spacy.io/api/lookups#from_bytes - """ - self._tables = {} - for key, value in srsly.msgpack_loads(bytes_data).items(): - self._tables[key] = Table(key, value) - return self - - def to_disk( - self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs - ) -> None: - """Save the lookups to a directory as lookups.bin. Expects a path to a - directory, which will be created if it doesn't exist. - - path (str / Path): The file path. - - DOCS: https://spacy.io/api/lookups#to_disk - """ - if len(self._tables): - path = ensure_path(path) - if not path.exists(): - path.mkdir() - filepath = path / filename - with filepath.open("wb") as file_: - file_.write(self.to_bytes()) - - def from_disk( - self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs - ) -> "Lookups": - """Load lookups from a directory containing a lookups.bin. Will skip - loading if the file doesn't exist. - - path (str / Path): The directory path. - RETURNS (Lookups): The loaded lookups. - - DOCS: https://spacy.io/api/lookups#from_disk - """ - path = ensure_path(path) - filepath = path / filename - if filepath.exists(): - with filepath.open("rb") as file_: - data = file_.read() - return self.from_bytes(data) - return self - - class Table(OrderedDict): """A table in the lookups. Subclass of builtin dict that implements a slightly more consistent and unified API. @@ -303,3 +159,159 @@ class Table(OrderedDict): self.clear() self.update(data) return self + + +class Lookups: + """Container for large lookup tables and dictionaries, e.g. lemmatization + data or tokenizer exception lists. Lookups are available via vocab.lookups, + so they can be accessed before the pipeline components are applied (e.g. + in the tokenizer and lemmatizer), as well as within the pipeline components + via doc.vocab.lookups. + """ + + def __init__(self) -> None: + """Initialize the Lookups object. 
+ + DOCS: https://spacy.io/api/lookups#init + """ + self._tables = {} + + def __contains__(self, name: str) -> bool: + """Check if the lookups contain a table of a given name. Delegates to + Lookups.has_table. + + name (str): Name of the table. + RETURNS (bool): Whether a table of that name is in the lookups. + """ + return self.has_table(name) + + def __len__(self) -> int: + """RETURNS (int): The number of tables in the lookups.""" + return len(self._tables) + + @property + def tables(self) -> List[str]: + """RETURNS (List[str]): Names of all tables in the lookups.""" + return list(self._tables.keys()) + + def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> Table: + """Add a new table to the lookups. Raises an error if the table exists. + + name (str): Unique name of table. + data (dict): Optional data to add to the table. + RETURNS (Table): The newly added table. + + DOCS: https://spacy.io/api/lookups#add_table + """ + if name in self.tables: + raise ValueError(Errors.E158.format(name=name)) + table = Table(name=name, data=data) + self._tables[name] = table + return table + + def set_table(self, name: str, table: Table) -> None: + """Set a table. + + name (str): Name of the table to set. + table (Table): The Table to set. + + DOCS: https://spacy.io/api/lookups#set_table + """ + self._tables[name] = table + + def get_table(self, name: str, default: Any = UNSET) -> Table: + """Get a table. Raises an error if the table doesn't exist and no + default value is provided. + + name (str): Name of the table. + default (Any): Optional default value to return if table doesn't exist. + RETURNS (Table): The table. + + DOCS: https://spacy.io/api/lookups#get_table + """ + if name not in self._tables: + if default == UNSET: + raise KeyError(Errors.E159.format(name=name, tables=self.tables)) + return default + return self._tables[name] + + def remove_table(self, name: str) -> Table: + """Remove a table. Raises an error if the table doesn't exist. 
+ + name (str): Name of the table to remove. + RETURNS (Table): The removed table. + + DOCS: https://spacy.io/api/lookups#remove_table + """ + if name not in self._tables: + raise KeyError(Errors.E159.format(name=name, tables=self.tables)) + return self._tables.pop(name) + + def has_table(self, name: str) -> bool: + """Check if the lookups contain a table of a given name. + + name (str): Name of the table. + RETURNS (bool): Whether a table of that name exists. + + DOCS: https://spacy.io/api/lookups#has_table + """ + return name in self._tables + + def to_bytes(self, **kwargs) -> bytes: + """Serialize the lookups to a bytestring. + + RETURNS (bytes): The serialized Lookups. + + DOCS: https://spacy.io/api/lookups#to_bytes + """ + return srsly.msgpack_dumps(self._tables) + + def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups": + """Load the lookups from a bytestring. + + bytes_data (bytes): The data to load. + RETURNS (Lookups): The loaded Lookups. + + DOCS: https://spacy.io/api/lookups#from_bytes + """ + self._tables = {} + for key, value in srsly.msgpack_loads(bytes_data).items(): + self._tables[key] = Table(key, value) + return self + + def to_disk( + self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs + ) -> None: + """Save the lookups to a directory as lookups.bin. Expects a path to a + directory, which will be created if it doesn't exist. + + path (str / Path): The file path. + + DOCS: https://spacy.io/api/lookups#to_disk + """ + if len(self._tables): + path = ensure_path(path) + if not path.exists(): + path.mkdir() + filepath = path / filename + with filepath.open("wb") as file_: + file_.write(self.to_bytes()) + + def from_disk( + self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs + ) -> "Lookups": + """Load lookups from a directory containing a lookups.bin. Will skip + loading if the file doesn't exist. + + path (str / Path): The directory path. + RETURNS (Lookups): The loaded lookups. 
+ + DOCS: https://spacy.io/api/lookups#from_disk + """ + path = ensure_path(path) + filepath = path / filename + if filepath.exists(): + with filepath.open("rb") as file_: + data = file_.read() + return self.from_bytes(data) + return self diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 3dec1bc70..4fe8f7428 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -27,12 +27,6 @@ cdef class Morphology: cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except * cdef int insert(self, MorphAnalysisC tag) except -1 - cdef int assign_untagged(self, TokenC* token) except -1 - cdef int assign_tag(self, TokenC* token, tag) except -1 - cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 - - cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1 - cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil cdef list list_features(const MorphAnalysisC* morph) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index b2ba32a59..fcfe216ba 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -31,43 +31,15 @@ cdef class Morphology: VALUE_SEP = "," EMPTY_MORPH = "_" # not an empty string so that the PreshMap key is not 0 - def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None): + def __init__(self, StringStore strings): self.mem = Pool() self.strings = strings self.tags = PreshMap() - self.load_tag_map(tag_map) - self.lemmatizer = lemmatizer - - self._cache = PreshMapArray(self.n_tags) - self._exc = {} - if exc is not None: - self.load_morph_exceptions(exc) - - def load_tag_map(self, tag_map): - self.tag_map = {} - self.reverse_index = {} - # Add special space symbol. We prefix with underscore, to make sure it - # always sorts to the end. 
- if '_SP' in tag_map: - space_attrs = tag_map.get('_SP') - else: - space_attrs = tag_map.get('SP', {POS: SPACE}) - if '_SP' not in tag_map: - self.strings.add('_SP') - tag_map = dict(tag_map) - tag_map['_SP'] = space_attrs - for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): - attrs = self.normalize_attrs(attrs) - self.add(attrs) - self.tag_map[tag_str] = dict(attrs) - self.reverse_index[self.strings.add(tag_str)] = i - self.tag_names = tuple(sorted(self.tag_map.keys())) - self.n_tags = len(self.tag_map) - self._cache = PreshMapArray(self.n_tags) def __reduce__(self): - return (Morphology, (self.strings, self.tag_map, self.lemmatizer, - self.exc), None, None) + tags = set([self.get(self.strings[s]) for s in self.strings]) + tags -= set([""]) + return (unpickle_morphology, (self.strings, sorted(tags)), None, None) def add(self, features): """Insert a morphological analysis in the morphology table, if not @@ -185,115 +157,6 @@ cdef class Morphology: else: return self.strings[tag.key] - def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): - if orth not in self.strings: - return orth - cdef unicode py_string = self.strings[orth] - if self.lemmatizer is None: - return self.strings.add(py_string.lower()) - cdef list lemma_strings - cdef unicode lemma_string - # Normalize features into a dict keyed by the field, to make life easier - # for the lemmatizer. Handles string-to-int conversion too. 
- string_feats = {} - for key, value in morphology.items(): - if value is True: - name, value = self.strings.as_string(key).split('_', 1) - string_feats[name] = value - else: - string_feats[self.strings.as_string(key)] = self.strings.as_string(value) - lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats) - lemma_string = lemma_strings[0] - lemma = self.strings.add(lemma_string) - return lemma - - def add_special_case(self, unicode tag_str, unicode orth_str, attrs, - force=False): - """Add a special-case rule to the morphological analyser. Tokens whose - tag and orth match the rule will receive the specified properties. - - tag (str): The part-of-speech tag to key the exception. - orth (str): The word-form to key the exception. - """ - attrs = dict(attrs) - attrs = self.normalize_attrs(attrs) - self.add(attrs) - attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) - self._exc[(tag_str, self.strings.add(orth_str))] = attrs - - cdef int assign_untagged(self, TokenC* token) except -1: - """Set morphological attributes on a token without a POS tag. Uses - the lemmatizer's lookup() method, which looks up the string in the - table provided by the language data as lemma_lookup (if available). - """ - if token.lemma == 0: - orth_str = self.strings[token.lex.orth] - lemma = self.lemmatizer.lookup(orth_str, orth=token.lex.orth) - token.lemma = self.strings.add(lemma) - - cdef int assign_tag(self, TokenC* token, tag_str) except -1: - cdef attr_t tag = self.strings.as_int(tag_str) - if tag in self.reverse_index: - tag_id = self.reverse_index[tag] - self.assign_tag_id(token, tag_id) - else: - token.tag = tag - - cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: - if tag_id > self.n_tags: - raise ValueError(Errors.E014.format(tag=tag_id)) - # Ensure spaces get tagged as space. - # It seems pretty arbitrary to put this logic here, but there's really - # nowhere better. 
I guess the justification is that this is where the - # specific word and the tag interact. Still, we should have a better - # way to enforce this rule, or figure out why the statistical model fails. - # Related to Issue #220 - if Lexeme.c_check_flag(token.lex, IS_SPACE): - tag_id = self.reverse_index[self.strings.add('_SP')] - tag_str = self.tag_names[tag_id] - features = dict(self.tag_map.get(tag_str, {})) - if features: - pos = self.strings.as_int(features.pop(POS)) - else: - pos = 0 - cdef attr_t lemma = self._cache.get(tag_id, token.lex.orth) - if lemma == 0: - # Ugh, self.lemmatize has opposite arg order from self.lemmatizer :( - lemma = self.lemmatize(pos, token.lex.orth, features) - self._cache.set(tag_id, token.lex.orth, lemma) - token.lemma = lemma - token.pos = pos - token.tag = self.strings[tag_str] - token.morph = self.add(features) - if (self.tag_names[tag_id], token.lex.orth) in self._exc: - self._assign_tag_from_exceptions(token, tag_id) - - cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1: - key = (self.tag_names[tag_id], token.lex.orth) - cdef dict attrs - attrs = self._exc[key] - token.pos = attrs.get(POS, token.pos) - token.lemma = attrs.get(LEMMA, token.lemma) - - def load_morph_exceptions(self, dict morph_rules): - self._exc = {} - # Map (form, pos) to attributes - for tag, exc in morph_rules.items(): - for orth, attrs in exc.items(): - attrs = self.normalize_attrs(attrs) - self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs) - - @property - def exc(self): - # generate the serializable exc in the MORPH_RULES format from the - # internal tuple-key format - morph_rules = {} - for (tag, orth) in sorted(self._exc): - if not tag in morph_rules: - morph_rules[tag] = {} - morph_rules[tag][self.strings[orth]] = self._exc[(tag, orth)] - return morph_rules - @staticmethod def feats_to_dict(feats): if not feats or feats == Morphology.EMPTY_MORPH: @@ -338,3 +201,9 @@ cdef int 
get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t fie results[n_results] = morph.features[i] n_results += 1 return n_results + +def unpickle_morphology(strings, tags): + cdef Morphology morphology = Morphology(strings) + for tag in tags: + morphology.add(tag) + return morphology diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 7f395b5f2..793aa83c3 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -3,9 +3,10 @@ from .dep_parser import DependencyParser from .entity_linker import EntityLinker from .ner import EntityRecognizer from .entityruler import EntityRuler +from .lemmatizer import Lemmatizer from .morphologizer import Morphologizer from .pipe import Pipe -from spacy.pipeline.senter import SentenceRecognizer +from .senter import SentenceRecognizer from .sentencizer import Sentencizer from .simple_ner import SimpleNER from .tagger import Tagger @@ -20,6 +21,7 @@ __all__ = [ "EntityRecognizer", "EntityRuler", "Morphologizer", + "Lemmatizer", "Pipe", "SentenceRecognizer", "Sentencizer", diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py new file mode 100644 index 000000000..f2028772f --- /dev/null +++ b/spacy/pipeline/lemmatizer.py @@ -0,0 +1,330 @@ +from typing import Optional, List, Dict, Any + +from thinc.api import Model + +from .pipe import Pipe +from ..errors import Errors +from ..language import Language +from ..lookups import Lookups, load_lookups +from ..scorer import Scorer +from ..tokens import Doc, Token +from ..vocab import Vocab +from .. 
import util + + +@Language.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={ + "model": None, + "mode": "lookup", + "lookups": None, + "overwrite": False, + }, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], + overwrite: bool = False, +): + lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite + ) + + +class Lemmatizer(Pipe): + """ + The Lemmatizer supports simple part-of-speech-sensitive suffix rules and + lookup tables. + + DOCS: https://spacy.io/api/lemmatizer + """ + + @classmethod + def get_lookups_config(cls, mode: str) -> Dict: + """Returns the lookups configuration settings for a given mode for use + in Lemmatizer.load_lookups. + + mode (str): The lemmatizer mode. + RETURNS (dict): The lookups configuration settings for this mode. + + DOCS: https://spacy.io/api/lemmatizer#get_lookups_config + """ + if mode == "lookup": + return { + "required_tables": ["lemma_lookup"], + } + elif mode == "rule": + return { + "required_tables": ["lemma_rules"], + "optional_tables": ["lemma_exc", "lemma_index"], + } + return {} + + @classmethod + def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups],) -> Lookups: + """Load and validate lookups tables. If the provided lookups is None, + load the default lookups tables according to the language and mode + settings. Confirm that all required tables for the language and mode + are present. + + lang (str): The language code. + mode (str): The lemmatizer mode. + lookups (Lookups): The provided lookups, may be None if the default + lookups should be loaded. + RETURNS (Lookups): The Lookups object. 
+ + DOCS: https://spacy.io/api/lemmatizer#load_lookups + """ + config = cls.get_lookups_config(mode) + required_tables = config.get("required_tables", []) + optional_tables = config.get("optional_tables", []) + if lookups is None: + lookups = load_lookups(lang=lang, tables=required_tables) + optional_lookups = load_lookups( + lang=lang, tables=optional_tables, strict=False + ) + for table in optional_lookups.tables: + lookups.set_table(table, optional_lookups.get_table(table)) + for table in required_tables: + if table not in lookups: + raise ValueError( + Errors.E1004.format( + mode=mode, tables=required_tables, found=lookups.tables + ) + ) + return lookups + + def __init__( + self, + vocab: Vocab, + model: Optional[Model], + name: str = "lemmatizer", + *, + mode: str = "lookup", + lookups: Optional[Lookups] = None, + overwrite: bool = False, + ) -> None: + """Initialize a Lemmatizer. + + vocab (Vocab): The vocab. + model (Model): A model (not yet implemented). + name (str): The component name. Defaults to "lemmatizer". + mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup". + lookups (Lookups): The lookups object containing the (optional) tables + such as "lemma_rules", "lemma_index", "lemma_exc" and + "lemma_lookup". Defaults to None + overwrite (bool): Whether to overwrite existing lemmas. Defaults to + `False`.
+ + DOCS: https://spacy.io/api/lemmatizer#init + """ + self.vocab = vocab + self.model = model + self._mode = mode + self.lookups = lookups if lookups is not None else Lookups() + self.overwrite = overwrite + if self.mode == "lookup": + self.lemmatize = self.lookup_lemmatize + elif self.mode == "rule": + self.lemmatize = self.rule_lemmatize + else: + try: + self.lemmatize = getattr(self, f"{self.mode}_lemmatize") + except AttributeError: + raise ValueError(Errors.E1003.format(mode=mode)) + self.cache = {} + + @property + def mode(self): + return self._mode + + def __call__(self, doc: Doc) -> Doc: + """Apply the lemmatizer to one document. + + doc (Doc): The Doc to process. + RETURNS (Doc): The processed Doc. + + DOCS: https://spacy.io/api/lemmatizer#call + """ + for token in doc: + if self.overwrite or token.lemma == 0: + token.lemma_ = self.lemmatize(token)[0] + return doc + + def pipe(self, stream, *, batch_size=128): + """Apply the pipe to a stream of documents. This usually happens under + the hood when the nlp object is called on a text and all components are + applied to the Doc. + + stream (Iterable[Doc]): A stream of documents. + batch_size (int): The number of documents to buffer. + YIELDS (Doc): Processed documents in order. + + DOCS: https://spacy.io/api/lemmatizer#pipe + """ + for doc in stream: + doc = self(doc) + yield doc + + def lookup_lemmatize(self, token: Token) -> List[str]: + """Lemmatize using a lookup-based approach. + + token (Token): The token to lemmatize. + RETURNS (list): The available lemmas for the string. + + DOCS: https://spacy.io/api/lemmatizer#lookup_lemmatize + """ + lookup_table = self.lookups.get_table("lemma_lookup", {}) + result = lookup_table.get(token.text, token.text) + if isinstance(result, str): + result = [result] + return result + + def rule_lemmatize(self, token: Token) -> List[str]: + """Lemmatize using a rule-based approach. + + token (Token): The token to lemmatize. 
+ RETURNS (list): The available lemmas for the string. + + DOCS: https://spacy.io/api/lemmatizer#rule_lemmatize + """ + cache_key = (token.orth, token.pos, token.morph) + if cache_key in self.cache: + return self.cache[cache_key] + string = token.text + univ_pos = token.pos_.lower() + if univ_pos in ("", "eol", "space"): + return [string.lower()] + # See Issue #435 for example of where this logic is required. + if self.is_base_form(token): + return [string.lower()] + index_table = self.lookups.get_table("lemma_index", {}) + exc_table = self.lookups.get_table("lemma_exc", {}) + rules_table = self.lookups.get_table("lemma_rules", {}) + if not any( + ( + index_table.get(univ_pos), + exc_table.get(univ_pos), + rules_table.get(univ_pos), + ) + ): + if univ_pos == "propn": + return [string] + else: + return [string.lower()] + + index = index_table.get(univ_pos, {}) + exceptions = exc_table.get(univ_pos, {}) + rules = rules_table.get(univ_pos, {}) + orig = string + string = string.lower() + forms = [] + oov_forms = [] + for old, new in rules: + if string.endswith(old): + form = string[: len(string) - len(old)] + new + if not form: + pass + elif form in index or not form.isalpha(): + forms.append(form) + else: + oov_forms.append(form) + # Remove duplicates but preserve the ordering of applied "rules" + forms = list(dict.fromkeys(forms)) + # Put exceptions at the front of the list, so they get priority. + # This is a dodgy heuristic -- but it's the best we can do until we get + # frequencies on this. We can at least prune out problematic exceptions, + # if they shadow more frequent analyses. + for form in exceptions.get(string, []): + if form not in forms: + forms.insert(0, form) + if not forms: + forms.extend(oov_forms) + if not forms: + forms.append(orig) + self.cache[cache_key] = forms + return forms + + def is_base_form(self, token: Token) -> bool: + """Check whether the token is a base form that does not need further + analysis for lemmatization.
+ + token (Token): The token. + RETURNS (bool): Whether the token is a base form. + + DOCS: https://spacy.io/api/lemmatizer#is_base_form + """ + return False + + def score(self, examples, **kwargs) -> Dict[str, Any]: + """Score a batch of examples. + + examples (Iterable[Example]): The examples to score. + RETURNS (Dict[str, Any]): The scores. + + DOCS: https://spacy.io/api/lemmatizer#score + """ + return Scorer.score_token_attr(examples, "lemma", **kwargs) + + def to_disk(self, path, *, exclude=tuple()): + """Save the current state to a directory. + + path (unicode or Path): A path to a directory, which will be created if + it doesn't exist. + exclude (list): String names of serialization fields to exclude. + + DOCS: https://spacy.io/api/lemmatizer#to_disk + """ + serialize = {} + serialize["vocab"] = lambda p: self.vocab.to_disk(p) + serialize["lookups"] = lambda p: self.lookups.to_disk(p) + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, *, exclude=tuple()): + """Loads state from a directory. Modifies the object in place and + returns it. + + path (unicode or Path): A path to a directory. + exclude (list): String names of serialization fields to exclude. + RETURNS (Vocab): The modified `Vocab` object. + + DOCS: https://spacy.io/api/lemmatizer#from_disk + """ + deserialize = {} + deserialize["vocab"] = lambda p: self.vocab.from_disk(p) + deserialize["lookups"] = lambda p: self.lookups.from_disk(p) + util.from_disk(path, deserialize, exclude) + + def to_bytes(self, *, exclude=tuple()) -> bytes: + """Serialize the current state to a binary string. + + exclude (list): String names of serialization fields to exclude. + RETURNS (bytes): The serialized form of the `Vocab` object.
+ + DOCS: https://spacy.io/api/lemmatizer#to_bytes + """ + serialize = {} + serialize["vocab"] = self.vocab.to_bytes + serialize["lookups"] = self.lookups.to_bytes + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data: bytes, *, exclude=tuple()): + """Load state from a binary string. + + bytes_data (bytes): The data to load from. + exclude (list): String names of serialization fields to exclude. + RETURNS (Vocab): The `Vocab` object. + + DOCS: https://spacy.io/api/lemmatizer#from_bytes + """ + deserialize = {} + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) + deserialize["lookups"] = lambda b: self.lookups.from_bytes(b) + util.from_bytes(bytes_data, deserialize, exclude) diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index da1b3d3aa..f1515889b 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -39,12 +39,12 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "tagger", assigns=["token.tag"], - default_config={"model": DEFAULT_TAGGER_MODEL, "set_morphology": False}, - scores=["tag_acc", "pos_acc", "lemma_acc"], + default_config={"model": DEFAULT_TAGGER_MODEL}, + scores=["tag_acc"], default_score_weights={"tag_acc": 1.0}, ) -def make_tagger(nlp: Language, name: str, model: Model, set_morphology: bool): - return Tagger(nlp.vocab, model, name, set_morphology=set_morphology) +def make_tagger(nlp: Language, name: str, model: Model): + return Tagger(nlp.vocab, model, name) class Tagger(Pipe): @@ -52,13 +52,14 @@ class Tagger(Pipe): DOCS: https://spacy.io/api/tagger """ - def __init__(self, vocab, model, name="tagger", *, labels=None): + def __init__(self, vocab, model, name="tagger", *, labels=None): """Initialize a part-of-speech tagger. vocab (Vocab): The shared vocabulary. model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training.
+ labels (List): The set of labels. Defaults to None. set_morphology (bool): Whether to set morphological features. DOCS: https://spacy.io/api/tagger#init @@ -67,7 +68,7 @@ class Tagger(Pipe): self.model = model self.name = name self._rehearsal_model = None - cfg = {"set_morphology": set_morphology} + cfg = {"labels": labels or []} self.cfg = dict(sorted(cfg.items())) @property @@ -80,7 +81,7 @@ class Tagger(Pipe): DOCS: https://spacy.io/api/tagger#labels """ - return tuple(self.vocab.morphology.tag_names) + return tuple(self.cfg["labels"]) def __call__(self, doc): """Apply the pipe to a Doc. @@ -150,9 +151,7 @@ class Tagger(Pipe): if isinstance(docs, Doc): docs = [docs] cdef Doc doc - cdef int idx = 0 cdef Vocab vocab = self.vocab - assign_morphology = self.cfg.get("set_morphology", True) for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): @@ -160,15 +159,7 @@ class Tagger(Pipe): for j, tag_id in enumerate(doc_tag_ids): # Don't clobber preset POS tags if doc.c[j].tag == 0: - if doc.c[j].pos == 0 and assign_morphology: - # Don't clobber preset lemmas - lemma = doc.c[j].lemma - vocab.morphology.assign_tag_id(&doc.c[j], tag_id) - if lemma != 0 and lemma != doc.c[j].lex.orth: - doc.c[j].lemma = lemma - else: - doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] - idx += 1 + doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] doc.is_tagged = True def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False): @@ -279,55 +270,26 @@ class Tagger(Pipe): DOCS: https://spacy.io/api/tagger#begin_training """ - lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] - if not any(table in self.vocab.lookups for table in lemma_tables): - warnings.warn(Warnings.W022) - lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) - if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: - langs = ", ".join(util.LEXEME_NORM_LANGS) - 
warnings.warn(Warnings.W033.format(model="part-of-speech tagger", langs=langs)) - orig_tag_map = dict(self.vocab.morphology.tag_map) - new_tag_map = {} + tags = set() for example in get_examples(): try: y = example.y except AttributeError: raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) from None for token in y: - tag = token.tag_ - if tag in orig_tag_map: - new_tag_map[tag] = orig_tag_map[tag] - else: - new_tag_map[tag] = {POS: X} - - cdef Vocab vocab = self.vocab - if new_tag_map: - if "_SP" in orig_tag_map: - new_tag_map["_SP"] = orig_tag_map["_SP"] - vocab.morphology.load_tag_map(new_tag_map) + tags.add(token.tag_) + for tag in sorted(tags): + self.add_label(tag) self.set_output(len(self.labels)) - doc_sample = [Doc(self.vocab, words=["hello", "world"])] - if pipeline is not None: - for name, component in pipeline: - if component is self: - break - if hasattr(component, "pipe"): - doc_sample = list(component.pipe(doc_sample)) - else: - doc_sample = [component(doc) for doc in doc_sample] - self.model.initialize(X=doc_sample) - # Get batch of example docs, example outputs to call begin_training(). - # This lets the model infer shapes. + self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd - def add_label(self, label, values=None): + def add_label(self, label): """Add a new label to the pipe. label (str): The label to add. - values (Dict[int, str]): Optional values to map to the label, e.g. a - tag map dictionary. RETURNS (int): 0 if label is already present, otherwise 1. DOCS: https://spacy.io/api/tagger#add_label @@ -336,22 +298,8 @@ class Tagger(Pipe): raise ValueError(Errors.E187) if label in self.labels: return 0 - if self.model.has_dim("nO"): - # Here's how the model resizing will work, once the - # neuron-to-tag mapping is no longer controlled by - # the Morphology class, which sorts the tag names. - # The sorting makes adding labels difficult. 
- # smaller = self.model._layers[-1] - # larger = Softmax(len(self.labels)+1, smaller.nI) - # copy_array(larger.W[:smaller.nO], smaller.W) - # copy_array(larger.b[:smaller.nO], smaller.b) - # self.model._layers[-1] = larger - raise ValueError(TempErrors.T003) - tag_map = dict(self.vocab.morphology.tag_map) - if values is None: - values = {POS: "X"} - tag_map[label] = values - self.vocab.morphology.load_tag_map(tag_map) + self.cfg["labels"].append(label) + self.vocab.strings.add(label) return 1 def score(self, examples, **kwargs): @@ -363,11 +311,7 @@ class Tagger(Pipe): DOCS: https://spacy.io/api/tagger#score """ - scores = {} - scores.update(Scorer.score_token_attr(examples, "tag", **kwargs)) - scores.update(Scorer.score_token_attr(examples, "pos", **kwargs)) - scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) - return scores + return Scorer.score_token_attr(examples, "tag", **kwargs) def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. @@ -381,10 +325,6 @@ class Tagger(Pipe): serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) - serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map) - morph_rules = dict(self.vocab.morphology.exc) - serialize["morph_rules"] = lambda: srsly.msgpack_dumps(morph_rules) return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, *, exclude=tuple()): @@ -402,21 +342,8 @@ class Tagger(Pipe): except AttributeError: raise ValueError(Errors.E149) from None - def load_tag_map(b): - tag_map = srsly.msgpack_loads(b) - self.vocab.morphology.load_tag_map(tag_map) - - def load_morph_rules(b): - morph_rules = srsly.msgpack_loads(b) - self.vocab.morphology.load_morph_exceptions(morph_rules) - - self.vocab.morphology = Morphology(self.vocab.strings, dict(), - lemmatizer=self.vocab.morphology.lemmatizer) - deserialize = { "vocab": lambda b: 
self.vocab.from_bytes(b), - "tag_map": load_tag_map, - "morph_rules": load_morph_rules, "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "model": lambda b: load_model(b), } @@ -431,12 +358,8 @@ class Tagger(Pipe): DOCS: https://spacy.io/api/tagger#to_disk """ - tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) - morph_rules = dict(self.vocab.morphology.exc) serialize = { "vocab": lambda p: self.vocab.to_disk(p), - "tag_map": lambda p: srsly.write_msgpack(p, tag_map), - "morph_rules": lambda p: srsly.write_msgpack(p, morph_rules), "model": lambda p: self.model.to_disk(p), "cfg": lambda p: srsly.write_json(p, self.cfg), } @@ -458,22 +381,9 @@ class Tagger(Pipe): except AttributeError: raise ValueError(Errors.E149) from None - def load_tag_map(p): - tag_map = srsly.read_msgpack(p) - self.vocab.morphology.load_tag_map(tag_map) - - def load_morph_rules(p): - morph_rules = srsly.read_msgpack(p) - self.vocab.morphology.load_morph_exceptions(morph_rules) - - self.vocab.morphology = Morphology(self.vocab.strings, dict(), - lemmatizer=self.vocab.morphology.lemmatizer) - deserialize = { "vocab": lambda p: self.vocab.from_disk(p), "cfg": lambda p: self.cfg.update(deserialize_config(p)), - "tag_map": load_tag_map, - "morph_rules": load_morph_rules, "model": load_model, } util.from_disk(path, deserialize, exclude) diff --git a/spacy/schemas.py b/spacy/schemas.py index d599ccbb2..0f2a35c60 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -220,7 +220,6 @@ class ConfigSchemaNlp(BaseModel): lang: StrictStr = Field(..., title="The base language to use") pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order") tokenizer: Callable = Field(..., title="The tokenizer to use") - lemmatizer: Callable = Field(..., title="The lemmatizer to use") load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data") before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = 
Field(..., title="Optional callback to modify Language class before initialization") after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed") diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index cfdb8e4ff..1c0595672 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -201,7 +201,7 @@ def ru_tokenizer(): @pytest.fixture def ru_lemmatizer(): pytest.importorskip("pymorphy2") - return get_lang_class("ru")().vocab.morphology.lemmatizer + return get_lang_class("ru")().add_pipe("lemmatizer") @pytest.fixture(scope="session") diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index 3ee833aa8..0dc6c4866 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -1,21 +1,12 @@ import pytest from spacy.vocab import Vocab from spacy.tokens import Doc -from spacy.lemmatizer import Lemmatizer -from spacy.lookups import Lookups from spacy import util @pytest.fixture -def lemmatizer(): - lookups = Lookups() - lookups.add_table("lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"}) - return Lemmatizer(lookups) - - -@pytest.fixture -def vocab(lemmatizer): - return Vocab(lemmatizer=lemmatizer) +def vocab(): + return Vocab() def test_empty_doc(vocab): @@ -30,14 +21,6 @@ def test_single_word(vocab): assert doc.text == "a" -def test_lookup_lemmatization(vocab): - doc = Doc(vocab, words=["dogs", "dogses"]) - assert doc[0].text == "dogs" - assert doc[0].lemma_ == "dog" - assert doc[1].text == "dogses" - assert doc[1].lemma_ == "dogses" - - def test_create_from_words_and_text(vocab): # no whitespace in words words = ["'", "dogs", "'", "run"] diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index 88557d100..6bfc198fd 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -1,23 +1,17 @@ 
import pytest -from spacy.symbols import POS, PRON, VERB @pytest.fixture def i_has(en_tokenizer): doc = en_tokenizer("I has") - tag_map = { - "PRP": {POS: PRON, "PronType": "prs"}, - "VBZ": { - POS: VERB, - "VerbForm": "fin", - "Tense": "pres", - "Number": "sing", - "Person": "three", - }, + doc[0].morph_ = {"PronType": "prs"} + doc[1].morph_ = { + "VerbForm": "fin", + "Tense": "pres", + "Number": "sing", + "Person": "three", } - en_tokenizer.vocab.morphology.load_tag_map(tag_map) - doc[0].tag_ = "PRP" - doc[1].tag_ = "VBZ" + return doc diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index e941b48ed..bc9567b2a 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -124,7 +124,6 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer): assert doc[0].text == "The players" assert doc[0].tag_ == "NN" assert doc[0].pos_ == "NOUN" - assert doc[0].lemma_ == "The players" doc = get_doc( tokens.vocab, words=[t.text for t in tokens], @@ -143,11 +142,9 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer): assert doc[0].text == "The players" assert doc[0].tag_ == "NN" assert doc[0].pos_ == "NOUN" - assert doc[0].lemma_ == "The players" assert doc[1].text == "start ." assert doc[1].tag_ == "VBZ" assert doc[1].pos_ == "VERB" - assert doc[1].lemma_ == "start ." def test_doc_retokenize_spans_merge_heads(en_tokenizer): diff --git a/spacy/tests/lang/en/test_tagger.py b/spacy/tests/lang/en/test_tagger.py deleted file mode 100644 index c4dc18bba..000000000 --- a/spacy/tests/lang/en/test_tagger.py +++ /dev/null @@ -1,21 +0,0 @@ -from spacy.symbols import POS, PRON, VERB, DET, NOUN, PUNCT -from ...util import get_doc - - -def test_en_tagger_load_morph_exc(en_tokenizer): - text = "I like his style." 
- tags = ["PRP", "VBP", "PRP$", "NN", "."] - tag_map = { - "PRP": {POS: PRON}, - "VBP": {POS: VERB}, - "PRP$": {POS: DET}, - "NN": {POS: NOUN}, - ".": {POS: PUNCT}, - } - morph_exc = {"VBP": {"like": {"lemma": "luck"}}} - en_tokenizer.vocab.morphology.load_tag_map(tag_map) - en_tokenizer.vocab.morphology.load_morph_exceptions(morph_exc) - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags) - assert doc[1].tag_ == "VBP" - assert doc[1].lemma_ == "luck" diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py index 8a87a7506..bcf103b65 100644 --- a/spacy/tests/lang/ru/test_lemmatizer.py +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -3,15 +3,16 @@ import pytest from ...util import get_doc -@pytest.mark.xfail(reason="TODO: investigate why lemmatizer fails here") -def test_ru_doc_lemmatization(ru_tokenizer): +def test_ru_doc_lemmatization(ru_lemmatizer): words = ["мама", "мыла", "раму"] - tags = [ - "NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing", - "VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", - "NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing", + pos = ["NOUN", "VERB", "NOUN"] + morphs = [ + "Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing", + "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", + "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing", ] - doc = get_doc(ru_tokenizer.vocab, words=words, tags=tags) + doc = get_doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs) + doc = ru_lemmatizer(doc) lemmas = [token.lemma_ for token in doc] assert lemmas == ["мама", "мыть", "рама"] @@ -27,43 +28,51 @@ def test_ru_doc_lemmatization(ru_tokenizer): ], ) def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas): - assert sorted(ru_lemmatizer.noun(text)) == lemmas + doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"]) + result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0]) + assert 
sorted(result_lemmas) == lemmas @pytest.mark.parametrize( - "text,pos,morphology,lemma", + "text,pos,morph,lemma", [ - ("рой", "NOUN", None, "рой"), - ("рой", "VERB", None, "рыть"), - ("клей", "NOUN", None, "клей"), - ("клей", "VERB", None, "клеить"), - ("три", "NUM", None, "три"), - ("кос", "NOUN", {"Number": "Sing"}, "кос"), - ("кос", "NOUN", {"Number": "Plur"}, "коса"), - ("кос", "ADJ", None, "косой"), - ("потом", "NOUN", None, "пот"), - ("потом", "ADV", None, "потом"), + ("рой", "NOUN", "", "рой"), + ("рой", "VERB", "", "рыть"), + ("клей", "NOUN", "", "клей"), + ("клей", "VERB", "", "клеить"), + ("три", "NUM", "", "три"), + ("кос", "NOUN", "Number=Sing", "кос"), + ("кос", "NOUN", "Number=Plur", "коса"), + ("кос", "ADJ", "", "косой"), + ("потом", "NOUN", "", "пот"), + ("потом", "ADV", "", "потом"), ], ) def test_ru_lemmatizer_works_with_different_pos_homonyms( - ru_lemmatizer, text, pos, morphology, lemma + ru_lemmatizer, text, pos, morph, lemma ): - assert ru_lemmatizer(text, pos, morphology) == [lemma] + doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph]) + result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0]) + assert result_lemmas == [lemma] @pytest.mark.parametrize( - "text,morphology,lemma", + "text,morph,lemma", [ - ("гвоздики", {"Gender": "Fem"}, "гвоздика"), - ("гвоздики", {"Gender": "Masc"}, "гвоздик"), - ("вина", {"Gender": "Fem"}, "вина"), - ("вина", {"Gender": "Neut"}, "вино"), + ("гвоздики", "Gender=Fem", "гвоздика"), + ("гвоздики", "Gender=Masc", "гвоздик"), + ("вина", "Gender=Fem", "вина"), + ("вина", "Gender=Neut", "вино"), ], ) -def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma): - assert ru_lemmatizer.noun(text, morphology) == [lemma] +def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma): + doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph]) + result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0]) + assert result_lemmas 
== [lemma] def test_ru_lemmatizer_punct(ru_lemmatizer): - assert ru_lemmatizer.punct("«") == ['"'] - assert ru_lemmatizer.punct("»") == ['"'] + doc = get_doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"]) + assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"'] + doc = get_doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"]) + assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"'] diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py new file mode 100644 index 000000000..8c235c86e --- /dev/null +++ b/spacy/tests/lang/test_lemmatizers.py @@ -0,0 +1,34 @@ +import pytest +from spacy import registry +from spacy.lookups import Lookups +from spacy.util import get_lang_class + + +# fmt: off +# Only include languages with no external dependencies +# excluded: ru, uk +# excluded for custom tables: pl +LANGUAGES = ["el", "en", "fr", "nl"] +# fmt: on + + +@pytest.mark.parametrize("lang", LANGUAGES) +def test_lemmatizer_initialize(lang, capfd): + @registry.assets("lemmatizer_init_lookups") + def lemmatizer_init_lookups(): + lookups = Lookups() + lookups.add_table("lemma_lookup", {"cope": "cope"}) + lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) + lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) + lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) + return lookups + + """Test that languages can be initialized.""" + nlp = get_lang_class(lang)() + nlp.add_pipe( + "lemmatizer", config={"lookups": {"@assets": "lemmatizer_init_lookups"}} + ) + # Check for stray print statements (see #3342) + doc = nlp("test") # noqa: F841 + captured = capfd.readouterr() + assert not captured.out diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py index f644a5867..0693da690 100644 --- a/spacy/tests/morphology/test_morph_features.py +++ b/spacy/tests/morphology/test_morph_features.py @@ -1,14 +1,11 @@ import pytest from spacy.morphology import Morphology from 
spacy.strings import StringStore, get_string_id -from spacy.lemmatizer import Lemmatizer -from spacy.lookups import Lookups @pytest.fixture def morphology(): - lemmatizer = Lemmatizer(Lookups()) - return Morphology(StringStore(), {}, lemmatizer) + return Morphology(StringStore()) def test_init(morphology): diff --git a/spacy/tests/morphology/test_morph_pickle.py b/spacy/tests/morphology/test_morph_pickle.py index 2c374e11f..0758a6c01 100644 --- a/spacy/tests/morphology/test_morph_pickle.py +++ b/spacy/tests/morphology/test_morph_pickle.py @@ -2,21 +2,18 @@ import pytest import pickle from spacy.morphology import Morphology from spacy.strings import StringStore -from spacy.lemmatizer import Lemmatizer -from spacy.lookups import Lookups @pytest.fixture def morphology(): - tag_map = {"A": {"POS": "X"}, "B": {"POS": "NOUN"}} - exc = {"A": {"a": {"POS": "VERB"}}} - lemmatizer = Lemmatizer(Lookups()) - return Morphology(StringStore(), tag_map, lemmatizer, exc=exc) + morphology = Morphology(StringStore()) + morphology.add("Feat1=Val1|Feat2=Val2") + morphology.add("Feat3=Val3|Feat4=Val4") + return morphology def test_morphology_pickle_roundtrip(morphology): b = pickle.dumps(morphology) reloaded_morphology = pickle.loads(b) - - assert morphology.tag_map == reloaded_morphology.tag_map - assert morphology.exc == reloaded_morphology.exc + assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2" + assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4" diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 45ae09702..8265a8a45 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -82,10 +82,10 @@ def test_parser_merge_pp(en_tokenizer): text = "A phrase with another phrase occurs" heads = [1, 4, -1, 1, -2, 0] deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT"] - tags = ["DT", "NN", "IN", "DT", "NN", "VBZ"] + pos = 
["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"] tokens = en_tokenizer(text) doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags + tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos, ) with doc.retokenize() as retokenizer: for np in doc.noun_chunks: diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py new file mode 100644 index 000000000..644fa0f01 --- /dev/null +++ b/spacy/tests/pipeline/test_lemmatizer.py @@ -0,0 +1,109 @@ +import pytest + +from spacy import util, registry +from spacy.lang.en import English +from spacy.lookups import Lookups, load_lookups + +from ..util import make_tempdir + + +@pytest.fixture +def nlp(): + return English() + + +@pytest.fixture +def lemmatizer(nlp): + @registry.assets("cope_lookups") + def cope_lookups(): + lookups = Lookups() + lookups.add_table("lemma_lookup", {"cope": "cope"}) + lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) + lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) + lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) + return lookups + + lemmatizer = nlp.add_pipe( + "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}} + ) + return lemmatizer + + +def test_lemmatizer_init(nlp): + @registry.assets("cope_lookups") + def cope_lookups(): + lookups = Lookups() + lookups.add_table("lemma_lookup", {"cope": "cope"}) + lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) + lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) + lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) + return lookups + + lemmatizer = nlp.add_pipe( + "lemmatizer", config={"mode": "lookup", "lookups": {"@assets": "cope_lookups"}} + ) + assert isinstance(lemmatizer.lookups, Lookups) + assert lemmatizer.mode == "lookup" + # replace any tables from spacy-lookups-data + lemmatizer.lookups = Lookups() + doc = nlp("coping") + # lookup with no tables sets text as 
lemma + assert doc[0].lemma_ == "coping" + + nlp.remove_pipe("lemmatizer") + + @registry.assets("empty_lookups") + def empty_lookups(): + return Lookups() + + with pytest.raises(ValueError): + nlp.add_pipe( + "lemmatizer", + config={"mode": "lookup", "lookups": {"@assets": "empty_lookups"}}, + ) + + +def test_lemmatizer_config(nlp, lemmatizer): + doc = nlp.make_doc("coping") + doc[0].pos_ = "VERB" + assert doc[0].lemma_ == "" + doc = lemmatizer(doc) + assert doc[0].text == "coping" + assert doc[0].lemma_ == "cope" + + doc = nlp.make_doc("coping") + doc[0].pos_ = "VERB" + assert doc[0].lemma_ == "" + doc = lemmatizer(doc) + assert doc[0].text == "coping" + assert doc[0].lemma_ == "cope" + + +def test_lemmatizer_serialize(nlp, lemmatizer): + @registry.assets("cope_lookups") + def cope_lookups(): + lookups = Lookups() + lookups.add_table("lemma_lookup", {"cope": "cope"}) + lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) + lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) + lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) + return lookups + + nlp2 = English() + lemmatizer2 = nlp2.add_pipe( + "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}} + ) + lemmatizer2.from_bytes(lemmatizer.to_bytes()) + assert lemmatizer.to_bytes() == lemmatizer2.to_bytes() + assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2.make_doc("coping") + doc2[0].pos_ = "VERB" + assert doc2[0].lemma_ == "" + doc2 = lemmatizer(doc2) + assert doc2[0].text == "coping" + assert doc2[0].lemma_ == "cope" diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index dd6739e17..5f27a0afa 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -23,13 +23,12 @@ def test_tagger_begin_training_tag_map(): 
nlp = Language() tagger = nlp.add_pipe("tagger") orig_tag_count = len(tagger.labels) - tagger.add_label("A", {"POS": "NOUN"}) + tagger.add_label("A") nlp.begin_training() - assert nlp.vocab.morphology.tag_map["A"] == {POS: NOUN} assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels) -TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}} +TAGS = ("N", "V", "J") MORPH_RULES = {"V": {"like": {"lemma": "luck"}}} @@ -42,15 +41,12 @@ TRAIN_DATA = [ def test_overfitting_IO(): # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly nlp = English() - nlp.vocab.morphology.load_tag_map(TAG_MAP) - nlp.vocab.morphology.load_morph_exceptions(MORPH_RULES) - tagger = nlp.add_pipe("tagger", config={"set_morphology": True}) - nlp.vocab.morphology.load_tag_map(TAG_MAP) + tagger = nlp.add_pipe("tagger") train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - for tag, values in TAG_MAP.items(): - tagger.add_label(tag, values) + for tag in TAGS: + tagger.add_label(tag) optimizer = nlp.begin_training() for i in range(50): @@ -65,7 +61,6 @@ def test_overfitting_IO(): assert doc[1].tag_ is "V" assert doc[2].tag_ is "J" assert doc[3].tag_ is "N" - assert doc[1].lemma_ == "luck" # Also test the results are still the same after IO with make_tempdir() as tmp_dir: @@ -76,4 +71,3 @@ def test_overfitting_IO(): assert doc2[1].tag_ is "V" assert doc2[2].tag_ is "J" assert doc2[3].tag_ is "N" - assert doc[1].lemma_ == "luck" diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index c1b83c6c4..b642ca229 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -8,10 +8,8 @@ from spacy.attrs import IS_PUNCT, ORTH, LOWER from spacy.symbols import POS, VERB from spacy.vocab import Vocab from spacy.lang.en import English -from spacy.lemmatizer import Lemmatizer from spacy.lookups import 
Lookups from spacy.tokens import Doc, Span -from spacy.lang.en.lemmatizer import is_base_form from ..util import get_doc, make_tempdir @@ -157,16 +155,15 @@ def test_issue590(en_vocab): assert len(matches) == 2 +@pytest.mark.skip(reason="Old vocab-based lemmatization") def test_issue595(): """Test lemmatization of base forms""" words = ["Do", "n't", "feed", "the", "dog"] - tag_map = {"VB": {POS: VERB, "VerbForm": "inf"}} lookups = Lookups() lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]}) lookups.add_table("lemma_index", {"verb": {}}) lookups.add_table("lemma_exc", {"verb": {}}) - lemmatizer = Lemmatizer(lookups, is_base_form=is_base_form) - vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) + vocab = Vocab() doc = Doc(vocab, words=words) doc[2].tag_ = "VB" assert doc[2].text == "feed" @@ -389,6 +386,7 @@ def test_issue891(en_tokenizer, text): assert tokens[1].text == "/" +@pytest.mark.skip(reason="Old vocab-based lemmatization") @pytest.mark.parametrize( "text,tag,lemma", [("anus", "NN", "anus"), ("princess", "NN", "princess"), ("inner", "JJ", "inner")], diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py index d612150de..0ac895546 100644 --- a/spacy/tests/regression/test_issue1001-1500.py +++ b/spacy/tests/regression/test_issue1001-1500.py @@ -6,7 +6,6 @@ from spacy.lang.en import English from spacy.lang.lex_attrs import LEX_ATTRS from spacy.matcher import Matcher from spacy.tokenizer import Tokenizer -from spacy.lemmatizer import Lemmatizer from spacy.lookups import Lookups from spacy.symbols import ORTH, LEMMA, POS, VERB @@ -57,6 +56,7 @@ def test_issue1242(): assert len(docs[1]) == 1 +@pytest.mark.skip(reason="v3 no longer supports LEMMA/POS in tokenizer special cases") def test_issue1250(): """Test cached special cases.""" special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}] @@ -87,20 +87,6 @@ def test_issue1375(): assert doc[1].nbor(1).text == "2" -def test_issue1387(): - 
tag_map = {"VBG": {POS: VERB, "VerbForm": "part"}} - lookups = Lookups() - lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) - lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) - lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) - lemmatizer = Lemmatizer(lookups) - vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) - doc = Doc(vocab, words=["coping"]) - doc[0].tag_ = "VBG" - assert doc[0].text == "coping" - assert doc[0].lemma_ == "cope" - - def test_issue1434(): """Test matches occur when optional element at end of short doc.""" pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}] diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index b5d586ec6..83afb11f3 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -130,8 +130,6 @@ def test_issue1727(): vectors = Vectors(data=data, keys=["I", "am", "Matt"]) tagger = nlp.create_pipe("tagger") tagger.add_label("PRP") - with pytest.warns(UserWarning): - tagger.begin_training() assert tagger.cfg.get("pretrained_dims", 0) == 0 tagger.vocab.vectors = vectors with make_tempdir() as path: diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index cf4e402e2..cf43e1a17 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -19,8 +19,8 @@ def test_issue2564(): """Test the tagger sets is_tagged correctly when used via Language.pipe.""" nlp = Language() tagger = nlp.add_pipe("tagger") - with pytest.warns(UserWarning): - tagger.begin_training() # initialise weights + tagger.add_label("A") + tagger.begin_training() doc = nlp("hello world") assert doc.is_tagged docs = nlp.pipe(["hello", "world"]) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index e93c27a59..98a6b9aa0 100644 --- 
a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -241,11 +241,11 @@ def test_issue3449(): assert t3[5].text == "I" -@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue3456(): # this crashed because of a padding error in layer.ops.unflatten in thinc nlp = English() - nlp.add_pipe("tagger") + tagger = nlp.add_pipe("tagger") + tagger.add_label("A") nlp.begin_training() list(nlp.pipe(["hi", ""])) diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index 6426c6c24..e42779ad7 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -149,13 +149,15 @@ def test_issue3540(en_vocab): gold_text = ["I", "live", "in", "NewYork", "right", "now"] assert [token.text for token in doc] == gold_text gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] + for i, lemma in enumerate(gold_lemma): + doc[i].lemma_ = lemma assert [token.lemma_ for token in doc] == gold_lemma vectors_1 = [token.vector for token in doc] assert len(vectors_1) == len(doc) with doc.retokenize() as retokenizer: heads = [(doc[3], 1), doc[2]] - attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]} + attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]} retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) gold_text = ["I", "live", "in", "New", "York", "right", "now"] diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 27464a39a..e1d03eaf5 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -271,6 +271,7 @@ def test_issue4267(): assert token.ent_iob == 2 +@pytest.mark.skip(reason="lemmatizer lookups no longer in vocab") def test_issue4272(): """Test that lookup table can be accessed from Token.lemma if no POS tags are available.""" diff --git 
a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 31292b700..4c6504f6b 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -62,8 +62,7 @@ def tagger(): # need to add model for two reasons: # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization - with pytest.warns(UserWarning): - tagger.begin_training(pipeline=nlp.pipeline) + tagger.begin_training(pipeline=nlp.pipeline) return tagger diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 17d5a3a1e..7ba4815ee 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -44,8 +44,8 @@ def blank_parser(en_vocab): def taggers(en_vocab): cfg = {"model": DEFAULT_TAGGER_MODEL} model = registry.make_from_config(cfg, validate=True)["model"] - tagger1 = Tagger(en_vocab, model, set_morphology=True) - tagger2 = Tagger(en_vocab, model, set_morphology=True) + tagger1 = Tagger(en_vocab, model) + tagger2 = Tagger(en_vocab, model) return tagger1, tagger2 @@ -125,8 +125,8 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers): tagger2.to_disk(file_path2) cfg = {"model": DEFAULT_TAGGER_MODEL} model = registry.make_from_config(cfg, validate=True)["model"] - tagger1_d = Tagger(en_vocab, model, set_morphology=True).from_disk(file_path1) - tagger2_d = Tagger(en_vocab, model, set_morphology=True).from_disk(file_path2) + tagger1_d = Tagger(en_vocab, model).from_disk(file_path1) + tagger2_d = Tagger(en_vocab, model).from_disk(file_path2) assert tagger1_d.to_bytes() == tagger2_d.to_bytes() diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 44930247a..45a546203 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -8,7 
+8,6 @@ from ..util import make_tempdir test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])] test_strings_attrs = [(["rats", "are", "cute"], "Hello")] -default_strings = ("_SP", "POS=SPACE") @pytest.mark.parametrize("text", ["rat"]) @@ -34,10 +33,8 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): assert vocab1.to_bytes() == vocab1_b new_vocab1 = Vocab().from_bytes(vocab1_b) assert new_vocab1.to_bytes() == vocab1_b - assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE - assert sorted([s for s in new_vocab1.strings]) == sorted( - strings1 + list(default_strings) - ) + assert len(new_vocab1.strings) == len(strings1) + assert sorted([s for s in new_vocab1.strings]) == sorted(strings1) @pytest.mark.parametrize("strings1,strings2", test_strings) @@ -52,16 +49,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2): vocab1_d = Vocab().from_disk(file_path1) vocab2_d = Vocab().from_disk(file_path2) # check strings rather than lexemes, which are only reloaded on demand - assert strings1 == [s for s in vocab1_d.strings if s not in default_strings] - assert strings2 == [s for s in vocab2_d.strings if s not in default_strings] + assert strings1 == [s for s in vocab1_d.strings] + assert strings2 == [s for s in vocab2_d.strings] if strings1 == strings2: - assert [s for s in vocab1_d.strings if s not in default_strings] == [ - s for s in vocab2_d.strings if s not in default_strings - ] + assert [s for s in vocab1_d.strings] == [s for s in vocab2_d.strings] else: - assert [s for s in vocab1_d.strings if s not in default_strings] != [ - s for s in vocab2_d.strings if s not in default_strings - ] + assert [s for s in vocab1_d.strings] != [s for s in vocab2_d.strings] @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @@ -80,7 +73,7 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr): # Reported in #2153 vocab = Vocab(strings=strings) vocab.from_bytes(vocab.to_bytes()) - assert 
len(vocab.strings) == len(strings) + 2 # adds _SP and POS=SPACE + assert len(vocab.strings) == len(strings) @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py deleted file mode 100644 index 3c904cb01..000000000 --- a/spacy/tests/test_lemmatizer.py +++ /dev/null @@ -1,64 +0,0 @@ -import pytest -from spacy.tokens import Doc -from spacy.language import Language -from spacy.lookups import Lookups -from spacy.lemmatizer import Lemmatizer - - -@pytest.mark.skip(reason="We probably don't want to support this anymore in v3?") -def test_lemmatizer_reflects_lookups_changes(): - """Test for an issue that'd cause lookups available in a model loaded from - disk to not be reflected in the lemmatizer.""" - nlp = Language() - assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "foo" - table = nlp.vocab.lookups.add_table("lemma_lookup") - table["foo"] = "bar" - assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "bar" - table = nlp.vocab.lookups.get_table("lemma_lookup") - table["hello"] = "world" - # The update to the table should be reflected in the lemmatizer - assert Doc(nlp.vocab, words=["hello"])[0].lemma_ == "world" - new_nlp = Language() - table = new_nlp.vocab.lookups.add_table("lemma_lookup") - table["hello"] = "hi" - assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "hi" - nlp_bytes = nlp.to_bytes() - new_nlp.from_bytes(nlp_bytes) - # Make sure we have the previously saved lookup table - assert "lemma_lookup" in new_nlp.vocab.lookups - assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2 - assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world" - assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar" - assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world" - - -def test_tagger_warns_no_lookups(): - nlp = Language() - nlp.vocab.lookups = Lookups() - assert not len(nlp.vocab.lookups) - tagger = nlp.add_pipe("tagger") - with 
pytest.warns(UserWarning): - tagger.begin_training() - with pytest.warns(UserWarning): - nlp.begin_training() - nlp.vocab.lookups.add_table("lemma_lookup") - nlp.vocab.lookups.add_table("lexeme_norm") - nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" - with pytest.warns(None) as record: - nlp.begin_training() - assert not record.list - - -def test_lemmatizer_without_is_base_form_implementation(): - # Norwegian example from #5658 - lookups = Lookups() - lookups.add_table("lemma_rules", {"noun": []}) - lookups.add_table("lemma_index", {"noun": {}}) - lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}}) - - lemmatizer = Lemmatizer(lookups, is_base_form=None) - assert lemmatizer( - "Formuesskatten", - "noun", - {"Definite": "def", "Gender": "masc", "Number": "sing"}, - ) == ["formuesskatt"] diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index c035559b4..b89c0627f 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -112,16 +112,15 @@ def test_tokenizer_validate_special_case(tokenizer, text, tokens): @pytest.mark.parametrize( - "text,tokens", [("lorem", [{"orth": "lo", "tag": "NN"}, {"orth": "rem"}])] + "text,tokens", [("lorem", [{"orth": "lo", "norm": "LO"}, {"orth": "rem"}])] ) def test_tokenizer_add_special_case_tag(text, tokens): - vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) + vocab = Vocab() tokenizer = Tokenizer(vocab, {}, None, None, None) tokenizer.add_special_case(text, tokens) doc = tokenizer(text) assert doc[0].text == tokens[0]["orth"] - assert doc[0].tag_ == tokens[0]["tag"] - assert doc[0].pos_ == "NOUN" + assert doc[0].norm_ == tokens[0]["norm"] assert doc[1].text == tokens[1]["orth"] diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 61f7c3db0..8d57b791f 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -11,7 +11,7 @@ from .span cimport Span from .token cimport Token 
from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..structs cimport LexemeC, TokenC -from ..attrs cimport TAG, MORPH +from ..attrs cimport MORPH from ..vocab cimport Vocab from .underscore import is_writable_attr @@ -365,8 +365,6 @@ def _split(Doc doc, int token_index, orths, heads, attrs): doc[token_index + i]._.set(ext_attr_key, ext_attr_value) # NB: We need to call get_string_id here because only the keys are # "intified" (since we support "KEY": [value, value] syntax here). - elif attr_name == TAG: - doc.vocab.morphology.assign_tag(token, get_string_id(attr_value)) else: # Set attributes on both token and lexeme to take care of token # attribute vs. lexical attribute without having to enumerate @@ -431,8 +429,6 @@ def set_token_attrs(Token py_token, attrs): if attr_name == "_": # Set extension attributes for ext_attr_key, ext_attr_value in attr_value.items(): py_token._.set(ext_attr_key, ext_attr_value) - elif attr_name == TAG: - doc.vocab.morphology.assign_tag(token, attr_value) else: # Set attributes on both token and lexeme to take care of token # attribute vs. lexical attribute without having to enumerate diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 935af88d1..15dafb86d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -832,13 +832,6 @@ cdef class Doc: rel_head_index=abs_head_index-i ) ) - # Do TAG first. 
This lets subsequent loop override stuff like POS, LEMMA - if TAG in attrs: - col = attrs.index(TAG) - for i in range(length): - value = values[col * stride + i] - if value != 0: - self.vocab.morphology.assign_tag(&tokens[i], value) # Verify ENT_IOB are proper integers if ENT_IOB in attrs: iob_strings = Token.iob_strings() @@ -857,12 +850,11 @@ cdef class Doc: for i in range(length): token = &self.c[i] for j in range(n_attrs): - if attr_ids[j] != TAG: - value = values[j * stride + i] - if attr_ids[j] == MORPH: - # add morph to morphology table - self.vocab.morphology.add(self.vocab.strings[value]) - Token.set_struct_attr(token, attr_ids[j], value) + value = values[j * stride + i] + if attr_ids[j] == MORPH: + # add morph to morphology table + self.vocab.morphology.add(self.vocab.strings[value]) + Token.set_struct_attr(token, attr_ids[j], value) # Set flags self.is_parsed = bool(self.is_parsed or HEAD in attrs) self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index a187c9722..9ad57e21b 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -332,11 +332,7 @@ cdef class Token: inflectional suffixes. """ def __get__(self): - if self.c.lemma == 0: - lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth) - return self.vocab.strings[lemma_] - else: - return self.c.lemma + return self.c.lemma def __set__(self, attr_t lemma): self.c.lemma = lemma @@ -355,7 +351,7 @@ cdef class Token: return self.c.tag def __set__(self, attr_t tag): - self.vocab.morphology.assign_tag(self.c, tag) + self.c.tag = tag property dep: """RETURNS (uint64): ID of syntactic dependency label.""" @@ -888,10 +884,7 @@ cdef class Token: with no inflectional suffixes. 
""" def __get__(self): - if self.c.lemma == 0: - return self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth) - else: - return self.vocab.strings[self.c.lemma] + return self.vocab.strings[self.c.lemma] def __set__(self, unicode lemma_): self.c.lemma = self.vocab.strings.add(lemma_) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ce95786f2..9e14f37d2 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -9,11 +9,10 @@ from .lexeme cimport EMPTY_LEXEME, OOV_RANK from .lexeme cimport Lexeme from .typedefs cimport attr_t from .tokens.token cimport Token -from .attrs cimport LANG, ORTH, TAG, POS +from .attrs cimport LANG, ORTH from .compat import copy_reg from .errors import Errors -from .lemmatizer import Lemmatizer from .attrs import intify_attrs, NORM, IS_STOP from .vectors import Vectors from .util import registry @@ -23,7 +22,7 @@ from .lang.norm_exceptions import BASE_NORMS from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang -def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=True): +def create_vocab(lang, defaults, vectors_name=None, load_data=True): # If the spacy-lookups-data package is installed, we pre-populate the lookups # with lexeme data, if available if load_data: @@ -43,7 +42,6 @@ def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=T ) return Vocab( lex_attr_getters=lex_attrs, - lemmatizer=lemmatizer, lookups=lookups, writing_system=defaults.writing_system, get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"), @@ -58,17 +56,13 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab """ - def __init__(self, lex_attr_getters=None, lemmatizer=None, - strings=tuple(), lookups=None, tag_map={}, + def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None, oov_prob=-20., vectors_name=None, writing_system={}, get_noun_chunks=None, **deprecated_kwargs): """Create the vocabulary. 
lex_attr_getters (dict): A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. - tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained - parts-of-speech, and optionally morphological attributes. - lemmatizer (object): A lemmatizer. Defaults to `None`. strings (StringStore): StringStore that maps strings to integers, and vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. @@ -78,8 +72,6 @@ cdef class Vocab: lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} if lookups in (None, True, False): lookups = Lookups() - if lemmatizer in (None, True, False): - lemmatizer = Lemmatizer(lookups) self.cfg = {'oov_prob': oov_prob} self.mem = Pool() self._by_orth = PreshMap() @@ -89,7 +81,7 @@ cdef class Vocab: for string in strings: _ = self[string] self.lex_attr_getters = lex_attr_getters - self.morphology = Morphology(self.strings, tag_map, lemmatizer) + self.morphology = Morphology(self.strings) self.vectors = Vectors(name=vectors_name) self.lookups = lookups self.writing_system = writing_system @@ -268,12 +260,6 @@ cdef class Vocab: # Set the special tokens up to have arbitrary attributes lex = self.get_by_orth(self.mem, props[ORTH]) token.lex = lex - if TAG in props: - self.morphology.assign_tag(token, props[TAG]) - elif POS in props: - # Don't allow POS to be set without TAG -- this causes problems, - # see #1773 - props.pop(POS) for attr_id, value in props.items(): Token.set_struct_attr(token, attr_id, value) # NORM is the only one that overlaps between the two diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 73f8aa71f..b6a9c80b5 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -1,102 +1,263 @@ --- title: Lemmatizer -teaser: Assign the base forms of words tag: class -source: spacy/lemmatizer.py +source: spacy/pipeline/lemmatizer.py +new: 3 +teaser: 'Pipeline component for lemmatization' 
+api_base_class: /api/pipe +api_string_name: lemmatizer +api_trainable: false --- - +## Config and implementation -The `Lemmatizer` supports simple part-of-speech-sensitive suffix rules and -lookup tables. +The default config is defined by the pipeline component factory and describes +how the component should be configured. You can override its settings via the +`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your +[`config.cfg` for training](/usage/training#config). + +For examples of the lookups data formats used by the lookup and rule-based +lemmatizers, see the +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo. + +> #### Example +> +> ```python +> config = {"mode": "rule"} +> nlp.add_pipe("lemmatizer", config=config) +> ``` + +| Setting | Type | Description | Default | +| ----------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ---------- | +| `mode` | str | The lemmatizer mode, e.g. "lookup" or "rule". | `"lookup"` | +| `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". If `None`, default tables are loaded from `spacy-lookups-data`. | `None` | +| `overwrite` | bool | Whether to overwrite existing lemmas. | `False` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` | + +```python +https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/lemmatizer.py +``` ## Lemmatizer.\_\_init\_\_ {#init tag="method"} -Initialize a `Lemmatizer`. Typically, this happens under the hood within spaCy -when a `Language` subclass and its `Vocab` is initialized. 
- > #### Example > > ```python -> from spacy.lemmatizer import Lemmatizer -> from spacy.lookups import Lookups -> lookups = Lookups() -> lookups.add_table("lemma_rules", {"noun": [["s", ""]]}) -> lemmatizer = Lemmatizer(lookups) -> ``` +> # Construction via add_pipe with default model +> lemmatizer = nlp.add_pipe("lemmatizer") > -> For examples of the data format, see the -> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo. +> # Construction via add_pipe with custom settings +> config = {"mode": "rule", "overwrite": True} +> lemmatizer = nlp.add_pipe("lemmatizer", config=config) +> ``` -| Name | Type | Description | -| -------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------- | -| `lookups` 2.2 | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. | +Create a new pipeline instance. In your application, you would normally use a +shortcut for this and instantiate the component using its string name and +[`nlp.add_pipe`](/api/language#add_pipe). + +| Name | Type | Description | +| -------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | [`Vocab`](/api/vocab) | The vocab. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| _keyword-only_ | | | +| mode | str | The lemmatizer mode, e.g. "lookup" or "rule". Defaults to "lookup". | +| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". Defaults to `None`.
| +| overwrite | bool | Whether to overwrite existing lemmas. | ## Lemmatizer.\_\_call\_\_ {#call tag="method"} -Lemmatize a string. +Apply the pipe to one document. The document is modified in place, and returned. +This usually happens under the hood when the `nlp` object is called on a text +and all pipeline components are applied to the `Doc` in order. > #### Example > > ```python -> from spacy.lemmatizer import Lemmatizer -> from spacy.lookups import Lookups -> lookups = Lookups() -> lookups.add_table("lemma_rules", {"noun": [["s", ""]]}) -> lemmatizer = Lemmatizer(lookups) -> lemmas = lemmatizer("ducks", "NOUN") -> assert lemmas == ["duck"] +> doc = nlp("This is a sentence.") +> lemmatizer = nlp.add_pipe("lemmatizer") +> # This usually happens under the hood +> processed = lemmatizer(doc) > ``` -| Name | Type | Description | -| ------------ | ------------- | -------------------------------------------------------------------------------------------------------- | -| `string` | str | The string to lemmatize, e.g. the token text. | -| `univ_pos` | str / int | The token's universal part-of-speech tag. | -| `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. | -| **RETURNS** | list | The available lemmas for the string. | +| Name | Type | Description | +| ----------- | ----- | ------------------------ | +| `doc` | `Doc` | The document to process. | +| **RETURNS** | `Doc` | The processed document. | -## Lemmatizer.lookup {#lookup tag="method" new="2"} +## Lemmatizer.pipe {#pipe tag="method"} -Look up a lemma in the lookup table, if available. If no lemma is found, the +Apply the pipe to a stream of documents. This usually happens under the hood +when the `nlp` object is called on a text and all pipeline components are +applied to the `Doc` in order. 
+ +> #### Example +> +> ```python +> lemmatizer = nlp.add_pipe("lemmatizer") +> for doc in lemmatizer.pipe(docs, batch_size=50): +> pass +> ``` + +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------ | +| `stream` | `Iterable[Doc]` | A stream of documents. | +| _keyword-only_ | | | +| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | +| **YIELDS** | `Doc` | Processed documents in the order of the original text. | + +## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"} + +Lemmatize a token using a lookup-based approach. If no lemma is found, the original string is returned. Languages can provide a [lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`. -> #### Example -> -> ```python -> lookups = Lookups() -> lookups.add_table("lemma_lookup", {"going": "go"}) -> assert lemmatizer.lookup("going") == "go" -> ``` +| Name | Type | Description | +| ----------- | --------------------- | ------------------------------------- | +| `token` | [`Token`](/api/token) | The token to lemmatize. | +| **RETURNS** | `List[str]` | A list containing one or more lemmas. | -| Name | Type | Description | -| ----------- | ---- | ----------------------------------------------------------------------------------------------------------- | -| `string` | str | The string to look up. | -| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. | -| **RETURNS** | str | The lemma if the string was found, otherwise the original string. | +## Lemmatizer.rule_lemmatize {#rule_lemmatize tag="method"} + +Lemmatize a token using a rule-based approach. Typically relies on POS tags. + +| Name | Type | Description | +| ----------- | --------------------- | ------------------------------------- | +| `token` | [`Token`](/api/token) | The token to lemmatize. 
| +| **RETURNS** | `List[str]` | A list containing one or more lemmas. | ## Lemmatizer.is_base_form {#is_base_form tag="method"} Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely. +| Name | Type | Description | +| ----------- | --------------------- | ------------------------------------------------------------------------------------------------------- | +| `token` | [`Token`](/api/token) | The token to analyze. | +| **RETURNS** | bool | Whether the token's attributes (e.g., part-of-speech tag, morphological features) describe a base form. | + +## Lemmatizer.get_lookups_config {#get_lookups_config tag="classmethod"} + +Returns the lookups configuration settings for a given mode for use in +[`Lemmatizer.load_lookups`](#load_lookups). + +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------------- | +| `mode` | str | The lemmatizer mode. | +| **RETURNS** | dict | The lookups configuration settings for this mode. | + +## Lemmatizer.load_lookups {#load_lookups tag="classmethod"} + +Load and validate lookups tables. If the provided lookups is `None`, load the +default lookups tables according to the language and mode settings. Confirm that +all required tables for the language and mode are present. + +| Name | Type | Description | +| ----------- | ------------------------- | ---------------------------------------------------------------------------- | +| `lang` | str | The language. | +| `mode` | str | The lemmatizer mode. | +| `lookups` | [`Lookups`](/api/lookups) | The provided lookups, may be `None` if the default lookups should be loaded. | +| **RETURNS** | [`Lookups`](/api/lookups) | The lookups object. | + +## Lemmatizer.to_disk {#to_disk tag="method"} + +Serialize the pipe to disk. 
+ > #### Example > > ```python -> pos = "verb" -> morph = {"VerbForm": "inf"} -> is_base_form = lemmatizer.is_base_form(pos, morph) -> assert is_base_form == True +> lemmatizer = nlp.add_pipe("lemmatizer") +> lemmatizer.to_disk("/path/to/lemmatizer") > ``` -| Name | Type | Description | -| ------------ | --------- | --------------------------------------------------------------------------------------- | -| `univ_pos` | str / int | The token's universal part-of-speech tag. | -| `morphology` | dict | The token's morphological features. | -| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | + +## Lemmatizer.from_disk {#from_disk tag="method"} + +Load the pipe from disk. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> lemmatizer = nlp.add_pipe("lemmatizer") +> lemmatizer.from_disk("/path/to/lemmatizer") +> ``` + +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Lemmatizer` | The modified `Lemmatizer` object. 
| + +## Lemmatizer.to_bytes {#to_bytes tag="method"} + +> #### Example +> +> ```python +> lemmatizer = nlp.add_pipe("lemmatizer") +> lemmatizer_bytes = lemmatizer.to_bytes() +> ``` + +Serialize the pipe to a bytestring. + +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Lemmatizer` object. | + +## Lemmatizer.from_bytes {#from_bytes tag="method"} + +Load the pipe from a bytestring. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> lemmatizer_bytes = lemmatizer.to_bytes() +> lemmatizer = nlp.add_pipe("lemmatizer") +> lemmatizer.from_bytes(lemmatizer_bytes) +> ``` + +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Lemmatizer` | The `Lemmatizer` object. | + +## Lemmatizer.mode {#mode tag="property"} + +The lemmatizer mode. + +| Name | Type | Description | +| ----------- | ----- | -------------------- | +| **RETURNS** | `str` | The lemmatizer mode. | ## Attributes {#attributes} -| Name | Type | Description | -| -------------------------------------- | ------------------------- | --------------------------------------------------------------- | -| `lookups` 2.2 | [`Lookups`](/api/lookups) | The lookups object containing the rules and data, if available. | +| Name | Type | Description | +| --------- | --------------------------------- | ------------------- | +| `vocab` | [`Vocab`](/api/vocab) | The shared vocab. | +| `lookups` | [`Lookups`](/api/lookups) | The lookups object.
| + +## Serialization fields {#serialization-fields} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. + +> #### Example +> +> ```python +> data = lemmatizer.to_disk("/path", exclude=["vocab"]) +> ``` + +| Name | Description | +| --------- | ---------------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `lookups` | The lookups. You usually don't want to exclude this. | diff --git a/website/docs/api/morphology.md b/website/docs/api/morphology.md index 8fb89c15f..3c5bf6fe4 100644 --- a/website/docs/api/morphology.md +++ b/website/docs/api/morphology.md @@ -11,22 +11,19 @@ this class. ## Morphology.\_\_init\_\_ {#init tag="method"} -Create a Morphology object using the tag map, lemmatizer and exceptions. +Create a Morphology object. > #### Example > > ```python > from spacy.morphology import Morphology > -> morphology = Morphology(strings, tag_map, lemmatizer) +> morphology = Morphology(strings) > ``` -| Name | Type | Description | -| ------------ | ----------------- | ---------------------------------------------------------------------------------------------------------- | -| `strings` | `StringStore` | The string store. | -| `tag_map` | `Dict[str, Dict]` | The tag map. | -| `lemmatizer` | `Lemmatizer` | The lemmatizer. | -| `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1, "Feat2": "Val2", ...}` | +| Name | Type | Description | +| --------- | ------------- | ----------------- | +| `strings` | `StringStore` | The string store. | ## Morphology.add {#add tag="method"} @@ -62,52 +59,6 @@ Get the FEATS string for the hash of the morphological analysis. | ------- | ---- | --------------------------------------- | | `morph` | int | The hash of the morphological analysis. 
| -## Morphology.load_tag_map {#load_tag_map tag="method"} - -Replace the current tag map with the provided tag map. - -| Name | Type | Description | -| --------- | ----------------- | ------------ | -| `tag_map` | `Dict[str, Dict]` | The tag map. | - -## Morphology.load_morph_exceptions {#load_morph_exceptions tag="method"} - -Replace the current morphological exceptions with the provided exceptions. - -| Name | Type | Description | -| ------------- | ----------------- | ----------------------------- | -| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. | - -## Morphology.add_special_case {#add_special_case tag="method"} - -Add a special-case rule to the morphological analyzer. Tokens whose tag and orth -match the rule will receive the specified properties. - -> #### Example -> -> ```python -> attrs = {"POS": "DET", "Definite": "Def"} -> morphology.add_special_case("DT", "the", attrs) -> ``` - -| Name | Type | Description | -| ---------- | ---- | ---------------------------------------------- | -| `tag_str` | str | The fine-grained tag. | -| `orth_str` | str | The token text. | -| `attrs` | dict | The features to assign for this token and tag. | - -## Morphology.exc {#exc tag="property"} - -The current morphological exceptions. - -| Name | Type | Description | -| ---------- | ---- | --------------------------------------------------- | -| **YIELDS** | dict | The current dictionary of morphological exceptions. 
| - -## Morphology.lemmatize {#lemmatize tag="method"} - -TODO - ## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"} Convert a string FEATS representation to a dictionary of features and values in diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 37ef13453..d9b8f4caf 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -47,7 +47,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx > > # Construction via create_pipe with custom model > config = {"model": {"@architectures": "my_tagger"}} -> parser = nlp.add_pipe("tagger", config=config) +> tagger = nlp.add_pipe("tagger", config=config) > > # Construction from class > from spacy.pipeline import Tagger @@ -285,16 +285,14 @@ Add a new label to the pipe. > #### Example > > ```python -> from spacy.symbols import POS > tagger = nlp.add_pipe("tagger") -> tagger.add_label("MY_LABEL", {POS: "NOUN"}) +> tagger.add_label("MY_LABEL") > ``` -| Name | Type | Description | -| ----------- | ---------------- | --------------------------------------------------------------- | -| `label` | str | The label to add. | -| `values` | `Dict[int, str]` | Optional values to map to the label, e.g. a tag map dictionary. | -| **RETURNS** | int | `0` if the label is already present, otherwise `1`. | +| Name | Type | Description | +| ----------- | ---- | --------------------------------------------------- | +| `label` | str | The label to add. | +| **RETURNS** | int | `0` if the label is already present, otherwise `1`. | ## Tagger.to_disk {#to_disk tag="method"} @@ -369,9 +367,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. ## Tagger.labels {#labels tag="property"} -The labels currently added to the component. Note that even for a blank -component, this will always include the built-in coarse-grained part-of-speech -tags by default, e.g. `VERB`, `NOUN` and so on. +The labels currently added to the component. 
> #### Example > @@ -396,9 +392,8 @@ serialization by passing in the string names via the `exclude` argument. > data = tagger.to_disk("/path", exclude=["vocab"]) > ``` -| Name | Description | -| --------- | ------------------------------------------------------------------------------------------ | -| `vocab` | The shared [`Vocab`](/api/vocab). | -| `cfg` | The config file. You usually don't want to exclude this. | -| `model` | The binary model data. You usually don't want to exclude this. | -| `tag_map` | The [tag map](/usage/adding-languages#tag-map) mapping fine-grained to coarse-grained tag. | +| Name | Description | +| ------- | -------------------------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `cfg` | The config file. You usually don't want to exclude this. | +| `model` | The binary model data. You usually don't want to exclude this. | diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index d5c9b0ff0..7e77762bb 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -24,8 +24,6 @@ Create the vocabulary. | Name | Type | Description | | -------------------------------------------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | | `lex_attr_getters` | dict | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. | -| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. | -| `lemmatizer` | object | A lemmatizer. Defaults to `None`. | | `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. | | `lookups` | `Lookups` | A [`Lookups`](/api/lookups) that stores the `lemma_\*`, `lexeme_norm` and other large lookup tables. Defaults to `None`. 
| | `lookups_extra` 2.3 | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. | From e829d3bf14fa2f6b15ef2a0ef4fa0fa7a37b49ad Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 7 Aug 2020 15:46:20 +0200 Subject: [PATCH 07/10] Update docs [ci skip] --- website/docs/usage/training.md | 66 ++++++++++++++++++++++++++++++++-- website/src/styles/layout.sass | 2 +- 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 78dfc3676..ef69c302c 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -114,8 +114,9 @@ Some of the main advantages and features of spaCy's training config are: passed into them. You can also register your own functions to define [custom architectures](#custom-models), reference them in your config and tweak their parameters. -- **Interpolation.** If you have hyperparameters used by multiple components, - define them once and reference them as variables. +- **Interpolation.** If you have hyperparameters or other settings used by + multiple components, define them once and reference them as + [variables](#config-interpolation). - **Reproducibility with no hidden defaults.** The config file is the "single source of truth" and includes all settings. - **Automated checks and validation.** When you load a config, spaCy checks if @@ -307,7 +308,66 @@ compound = 1.001 ### Using variable interpolation {#config-interpolation} - +Another very useful feature of the config system is that it supports variable +interpolation for both **values and sections**. This means that you only need to +define a setting once and can reference it across your config using the +`${section:value}` or `${section.block}` syntax. 
In this example, the value of +`seed` is reused within the `[training]` block, and the whole block of +`[training.optimizer]` is reused in `[pretraining]` and will become +`pretraining.optimizer`. + +> #### Note on syntax +> +> There are two different ways to format your variables, depending on whether +> you want to reference a single value or a block. Values are specified after a +> `:`, while blocks are specified with a `.`: +> +> 1. `${section:value}`, `${section.subsection:value}` +> 2. `${section.block}`, `${section.subsection.block}` + +```ini +### config.cfg (excerpt) {highlight="5,18"} +[system] +seed = 0 + +[training] +seed = ${system:seed} + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 1e-8 + +[pretraining] +optimizer = ${training.optimizer} +``` + +You can also use variables inside strings. In that case, interpolation works +just like f-strings in Python. If the value of a variable is not a string, it's +converted to a string. + +```ini +[paths] +version = 5 +root = "/Users/you/data" +train = "${paths:root}/train_${paths:version}.spacy" +# Result: /Users/you/data/train_5.spacy +``` + + + +If you need to change certain values between training runs, you can define them +once, reference them as variables and then [override](#config-overrides) them on +the CLI. For example, `--paths.root /other/root` will change the value of `root` +in the block `[paths]` and the change will be reflected across all other values +that reference this variable. 
+ + ### Model architectures {#model-architectures} diff --git a/website/src/styles/layout.sass b/website/src/styles/layout.sass index 9660363dd..3591fb005 100644 --- a/website/src/styles/layout.sass +++ b/website/src/styles/layout.sass @@ -373,7 +373,7 @@ body [id]:target margin-right: -1.5em margin-left: -1.5em padding-right: 1.5em - padding-left: 1.1em + padding-left: 1.25em &:empty:before // Fix issue where empty lines would disappear From b7e34c145192efbb4cfb0fef552d9a0b3a5b36df Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 7 Aug 2020 16:13:13 +0200 Subject: [PATCH 08/10] Update docs [ci skip] --- website/docs/api/lemmatizer.md | 12 ++++++------ website/docs/usage/101/_pipelines.md | 17 +++++++++-------- website/docs/usage/processing-pipelines.md | 5 +---- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index b6a9c80b5..6a6bb1244 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -27,12 +27,12 @@ lemmatizers, see the > nlp.add_pipe("lemmatizer", config=config) > ``` -| Setting | Type | Description | Default | -| ----------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ---------- | -| `mode` | str | The lemmatizer mode, e.g. "lookup" or "rule". | `"lookup"` | -| `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". If `None`, default tables are loaded from `spacy-lookups-data`. | `None` | -| `overwrite` | bool | Whether to overwrite existing lemmas. | `False` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. 
| `None` | +| Setting | Type | Description | Default | +| ----------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | +| `mode` | str | The lemmatizer mode, e.g. "lookup" or "rule". | `"lookup"` | +| `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from `spacy-lookups-data`. | `None` | +| `overwrite` | bool | Whether to overwrite existing lemmas. | `False` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/lemmatizer.py diff --git a/website/docs/usage/101/_pipelines.md b/website/docs/usage/101/_pipelines.md index 1ea165515..899ffa7cd 100644 --- a/website/docs/usage/101/_pipelines.md +++ b/website/docs/usage/101/_pipelines.md @@ -12,14 +12,15 @@ passed on to the next component. > - **Creates:** Objects, attributes and properties modified and set by the > component. -| Name | Component | Creates | Description | -| ------------- | ------------------------------------------------------------------ | ----------------------------------------------------------- | ------------------------------------------------ | -| **tokenizer** | [`Tokenizer`](/api/tokenizer) | `Doc` | Segment text into tokens. | -| **tagger** | [`Tagger`](/api/tagger) | `Doc[i].tag` | Assign part-of-speech tags. | -| **parser** | [`DependencyParser`](/api/dependencyparser) | `Doc[i].head`, `Doc[i].dep`, `Doc.sents`, `Doc.noun_chunks` | Assign dependency labels. | -| **ner** | [`EntityRecognizer`](/api/entityrecognizer) | `Doc.ents`, `Doc[i].ent_iob`, `Doc[i].ent_type` | Detect and label named entities. 
| -| **textcat** | [`TextCategorizer`](/api/textcategorizer) | `Doc.cats` | Assign document labels. | -| ... | [custom components](/usage/processing-pipelines#custom-components) | `Doc._.xxx`, `Token._.xxx`, `Span._.xxx` | Assign custom attributes, methods or properties. | +| Name | Component | Creates | Description | +| -------------- | ------------------------------------------------------------------ | --------------------------------------------------------- | ------------------------------------------------ | +| **tokenizer** | [`Tokenizer`](/api/tokenizer) | `Doc` | Segment text into tokens. | +| **tagger** | [`Tagger`](/api/tagger) | `Token.tag` | Assign part-of-speech tags. | +| **parser** | [`DependencyParser`](/api/dependencyparser) | `Token.head`, `Token.dep`, `Doc.sents`, `Doc.noun_chunks` | Assign dependency labels. | +| **ner** | [`EntityRecognizer`](/api/entityrecognizer) | `Doc.ents`, `Token.ent_iob`, `Token.ent_type` | Detect and label named entities. | +| **lemmatizer** | [`Lemmatizer`](/api/lemmatizer) | `Token.lemma` | Assign base forms. | +| **textcat** | [`TextCategorizer`](/api/textcategorizer) | `Doc.cats` | Assign document labels. | +| ... | [custom components](/usage/processing-pipelines#custom-components) | `Doc._.xxx`, `Token._.xxx`, `Span._.xxx` | Assign custom attributes, methods or properties. | The processing pipeline always **depends on the statistical model** and its capabilities. For example, a pipeline can only include an entity recognizer diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index ae1616f8b..741d19a14 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -228,16 +228,13 @@ available pipeline components and component functions. | `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. 
| | `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules and dictionaries. | | `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. | +| `lemmatizer` | [`Lemmatizer`](/api/lemmatizer) | Assign base forms to words. | | `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. | | `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. | | `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | | `tok2vec` | [`Tok2Vec`](/api/tok2vec) | | | `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. | - - - - ### Disabling and modifying pipeline components {#disabling} If you don't need a particular component of the pipeline – for example, the From 5e1421e5a65b242704cddd28ba554baf7c38a91d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 7 Aug 2020 16:23:12 +0200 Subject: [PATCH 09/10] Update docs [ci skip] --- website/docs/usage/101/_pipelines.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/101/_pipelines.md b/website/docs/usage/101/_pipelines.md index 899ffa7cd..4bbc41f62 100644 --- a/website/docs/usage/101/_pipelines.md +++ b/website/docs/usage/101/_pipelines.md @@ -20,7 +20,7 @@ passed on to the next component. | **ner** | [`EntityRecognizer`](/api/entityrecognizer) | `Doc.ents`, `Token.ent_iob`, `Token.ent_type` | Detect and label named entities. | | **lemmatizer** | [`Lemmatizer`](/api/lemmatizer) | `Token.lemma` | Assign base forms. | | **textcat** | [`TextCategorizer`](/api/textcategorizer) | `Doc.cats` | Assign document labels. | -| ... | [custom components](/usage/processing-pipelines#custom-components) | `Doc._.xxx`, `Token._.xxx`, `Span._.xxx` | Assign custom attributes, methods or properties. 
| +| **custom** | [custom components](/usage/processing-pipelines#custom-components) | `Doc._.xxx`, `Token._.xxx`, `Span._.xxx` | Assign custom attributes, methods or properties. | The processing pipeline always **depends on the statistical model** and its capabilities. For example, a pipeline can only include an entity recognizer From 3901b088ff9574b7d4fbd926e653321d7d6d17dd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 7 Aug 2020 17:14:13 +0200 Subject: [PATCH 10/10] Update graphics and 101 [ci skip] --- website/docs/images/language_data.svg | 85 ---- website/docs/images/tokenization.svg | 426 ++++++++++++++------ website/docs/images/training-loop.svg | 40 -- website/docs/images/vocab_stringstore.svg | 193 +++++---- website/docs/usage/101/_language-data.md | 13 +- website/docs/usage/spacy-101.md | 452 ---------------------- 6 files changed, 424 insertions(+), 785 deletions(-) delete mode 100644 website/docs/images/language_data.svg delete mode 100644 website/docs/images/training-loop.svg diff --git a/website/docs/images/language_data.svg b/website/docs/images/language_data.svg deleted file mode 100644 index 58482b2c5..000000000 --- a/website/docs/images/language_data.svg +++ /dev/null @@ -1,85 +0,0 @@ - - - - - - Tokenizer - - - - - - - - - - Base data - - - - - - - - - - - - - - - - Language data - - - - stop words - - - - lexical attributes - - - - - - tokenizer exceptions - - - - - - prefixes, suffixes, infixes - - - - - lemma data - - - - Lemmatizer - - - - char classes - - Token - - - - morph rules - - - - tag map - - Morphology - diff --git a/website/docs/images/tokenization.svg b/website/docs/images/tokenization.svg index 9877e1a30..d676fdace 100644 --- a/website/docs/images/tokenization.svg +++ b/website/docs/images/tokenization.svg @@ -1,123 +1,305 @@ - - - - - “Let’s - - - go - - - to - - - N.Y.!” - - - - - - Let’s - - - go - - - to - - - N.Y.!” - - - - - Let - - - go - - - to - - - N.Y.!” - - - ’s - - - - - - Let - - - go - - - to - - - N.Y.! 
- - - ’s - - - - - - - - - Let - - - go - - - to - - - N.Y. - - - ’s - - - - - - ! - - - - Let - - go - - to - - N.Y. - - ’s - - - - ! - - EXCEPTION - - PREFIX - - SUFFIX - - SUFFIX - - EXCEPTION - - DONE + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/docs/images/training-loop.svg b/website/docs/images/training-loop.svg deleted file mode 100644 index 144fe2d3d..000000000 --- a/website/docs/images/training-loop.svg +++ /dev/null @@ -1,40 +0,0 @@ - - - - - - - - Training data - - - - label - - - - text - - - - - - Doc - - - - Example - - - - update - - nlp - - - - optimizer - diff --git a/website/docs/images/vocab_stringstore.svg b/website/docs/images/vocab_stringstore.svg index b604041f2..e10ff3c58 100644 --- a/website/docs/images/vocab_stringstore.svg +++ b/website/docs/images/vocab_stringstore.svg @@ -1,77 +1,118 @@ - - - - - 31979... - - Lexeme - - 46904... - - Lexeme - - 37020... 
- - Lexeme - - - "coffee" - - 31979… - - "I" - - 46904… - - "love" - - 37020… - - - - - nsubj - - - - dobj - - String - Store - - Vocab - - Doc - - love - VERB - - Token - - I - PRON - - Token - - coffee - NOUN - - Token - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/docs/usage/101/_language-data.md b/website/docs/usage/101/_language-data.md index 2917b19c4..8c3cd48a3 100644 --- a/website/docs/usage/101/_language-data.md +++ b/website/docs/usage/101/_language-data.md @@ -10,8 +10,9 @@ The **shared language data** in the directory root includes rules that can be generalized across languages – for example, rules for basic punctuation, emoji, emoticons and single-letter abbreviations. The **individual language data** in a submodule contains rules that are only relevant to a particular language. It -also takes care of putting together all components and creating the `Language` -subclass – for example, `English` or `German`. +also takes care of putting together all components and creating the +[`Language`](/api/language) subclass – for example, `English` or `German`. The +values are defined in the [`Language.Defaults`](/api/language#defaults). > ```python > from spacy.lang.en import English @@ -21,14 +22,6 @@ subclass – for example, `English` or `German`. > nlp_de = German() # Includes German data > ``` - - - - | Name | Description | | ---------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | | **Stop words**
[`stop_words.py`][stop_words.py] | List of most common words of a language that are often useful to filter out, for example "and" or "I". Matching tokens will return `True` for `is_stop`. | diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index db471b1f0..27c4e3eb3 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -10,7 +10,6 @@ menu: - ['Serialization', 'serialization'] - ['Training', 'training'] - ['Language Data', 'language-data'] - - ['Lightning Tour', 'lightning-tour'] - ['Architecture', 'architecture'] - ['Community & FAQ', 'community-faq'] --- @@ -379,79 +378,6 @@ spaCy will also export the `Vocab` when you save a `Doc` or `nlp` object. This will give you the object and its encoded annotations, plus the "key" to decode it. -## Knowledge base {#kb} - -To support the entity linking task, spaCy stores external knowledge in a -[`KnowledgeBase`](/api/kb). The knowledge base (KB) uses the `Vocab` to store -its data efficiently. - -> - **Mention**: A textual occurrence of a named entity, e.g. 'Miss Lovelace'. -> - **KB ID**: A unique identifier referring to a particular real-world concept, -> e.g. 'Q7259'. -> - **Alias**: A plausible synonym or description for a certain KB ID, e.g. 'Ada -> Lovelace'. -> - **Prior probability**: The probability of a certain mention resolving to a -> certain KB ID, prior to knowing anything about the context in which the -> mention is used. -> - **Entity vector**: A pretrained word vector capturing the entity -> description. - -A knowledge base is created by first adding all entities to it. Next, for each -potential mention or alias, a list of relevant KB IDs and their prior -probabilities is added. The sum of these prior probabilities should never exceed -1 for any given alias. 
- -```python -### {executable="true"} -import spacy -from spacy.kb import KnowledgeBase - -# load the model and create an empty KB -nlp = spacy.load('en_core_web_sm') -kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) - -# adding entities -kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0, 3, 5]) -kb.add_entity(entity="Q42", freq=342, entity_vector=[1, 9, -3]) -kb.add_entity(entity="Q5301561", freq=12, entity_vector=[-2, 4, 2]) - -# adding aliases -kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.6, 0.1, 0.2]) -kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9]) - -print() -print("Number of entities in KB:",kb.get_size_entities()) # 3 -print("Number of aliases in KB:", kb.get_size_aliases()) # 2 -``` - -### Candidate generation - -Given a textual entity, the knowledge base can provide a list of plausible -candidates or entity identifiers. The [`EntityLinker`](/api/entitylinker) will -take this list of candidates as input, and disambiguate the mention to the most -probable identifier, given the document context. 
- -```python -### {executable="true"} -import spacy -from spacy.kb import KnowledgeBase - -nlp = spacy.load('en_core_web_sm') -kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) - -# adding entities -kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0, 3, 5]) -kb.add_entity(entity="Q42", freq=342, entity_vector=[1, 9, -3]) -kb.add_entity(entity="Q5301561", freq=12, entity_vector=[-2, 4, 2]) - -# adding aliases -kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.6, 0.1, 0.2]) - -candidates = kb.get_candidates("Douglas") -for c in candidates: - print(" ", c.entity_, c.prior_prob, c.entity_vector) -``` - ## Serialization {#serialization} import Serialization101 from 'usage/101/\_serialization.md' @@ -485,384 +411,6 @@ import LanguageData101 from 'usage/101/\_language-data.md' -## Lightning tour {#lightning-tour} - -The following examples and code snippets give you an overview of spaCy's -functionality and its usage. - -### Install models and process text {#lightning-tour-models} - -```bash -python -m spacy download en_core_web_sm -python -m spacy download de_core_news_sm -``` - -```python -### {executable="true"} -import spacy - -nlp = spacy.load("en_core_web_sm") -doc = nlp("Hello, world. Here are two sentences.") -print([t.text for t in doc]) - -nlp_de = spacy.load("de_core_news_sm") -doc_de = nlp_de("Ich bin ein Berliner.") -print([t.text for t in doc_de]) - -``` - - - -**API:** [`spacy.load()`](/api/top-level#spacy.load) **Usage:** -[Models](/usage/models), [spaCy 101](/usage/spacy-101) - - - -### Get tokens, noun chunks & sentences {#lightning-tour-tokens-sentences model="parser"} - -```python -### {executable="true"} -import spacy - -nlp = spacy.load("en_core_web_sm") -doc = nlp("Peach emoji is where it has always been. Peach is the superior " - "emoji. 
It's outranking eggplant 🍑 ") -print(doc[0].text) # 'Peach' -print(doc[1].text) # 'emoji' -print(doc[-1].text) # '🍑' -print(doc[17:19].text) # 'outranking eggplant' - -noun_chunks = list(doc.noun_chunks) -print(noun_chunks[0].text) # 'Peach emoji' - -sentences = list(doc.sents) -assert len(sentences) == 3 -print(sentences[1].text) # 'Peach is the superior emoji.' -``` - - - -**API:** [`Doc`](/api/doc), [`Token`](/api/token) **Usage:** -[spaCy 101](/usage/spacy-101) - - - -### Get part-of-speech tags and flags {#lightning-tour-pos-tags model="tagger"} - -```python -### {executable="true"} -import spacy - -nlp = spacy.load("en_core_web_sm") -doc = nlp("Apple is looking at buying U.K. startup for $1 billion") -apple = doc[0] -print("Fine-grained POS tag", apple.pos_, apple.pos) -print("Coarse-grained POS tag", apple.tag_, apple.tag) -print("Word shape", apple.shape_, apple.shape) -print("Alphabetic characters?", apple.is_alpha) -print("Punctuation mark?", apple.is_punct) - -billion = doc[10] -print("Digit?", billion.is_digit) -print("Like a number?", billion.like_num) -print("Like an email address?", billion.like_email) -``` - - - -**API:** [`Token`](/api/token) **Usage:** -[Part-of-speech tagging](/usage/linguistic-features#pos-tagging) - - - -### Use hash values for any string {#lightning-tour-hashes} - -```python -### {executable="true"} -import spacy - -nlp = spacy.load("en_core_web_sm") -doc = nlp("I love coffee") - -coffee_hash = nlp.vocab.strings["coffee"] # 3197928453018144401 -coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee' -print(coffee_hash, coffee_text) -print(doc[2].orth, coffee_hash) # 3197928453018144401 -print(doc[2].text, coffee_text) # 'coffee' - -beer_hash = doc.vocab.strings.add("beer") # 3073001599257881079 -beer_text = doc.vocab.strings[beer_hash] # 'beer' -print(beer_hash, beer_text) - -unicorn_hash = doc.vocab.strings.add("🦄") # 18234233413267120783 -unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄' -print(unicorn_hash, unicorn_text) 
-``` - - - -**API:** [`StringStore`](/api/stringstore) **Usage:** -[Vocab, hashes and lexemes 101](/usage/spacy-101#vocab) - - - -### Recognize and update named entities {#lightning-tour-entities model="ner"} - -```python -### {executable="true"} -import spacy -from spacy.tokens import Span - -nlp = spacy.load("en_core_web_sm") -doc = nlp("San Francisco considers banning sidewalk delivery robots") -for ent in doc.ents: - print(ent.text, ent.start_char, ent.end_char, ent.label_) - -doc = nlp("FB is hiring a new VP of global policy") -doc.ents = [Span(doc, 0, 1, label="ORG")] -for ent in doc.ents: - print(ent.text, ent.start_char, ent.end_char, ent.label_) -``` - - - -**Usage:** [Named entity recognition](/usage/linguistic-features#named-entities) - - - -### Train and update neural network models {#lightning-tour-training"} - -```python -import random -import spacy -from spacy.gold import Example - -nlp = spacy.load("en_core_web_sm") -train_data = [("Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})] - -with nlp.select_pipes(enable="ner"): - optimizer = nlp.begin_training() - for i in range(10): - random.shuffle(train_data) - for text, annotations in train_data: - doc = nlp.make_doc(text) - example = Example.from_dict(doc, annotations) - nlp.update([example], sgd=optimizer) -nlp.to_disk("/model") -``` - - - -**API:** [`Language.update`](/api/language#update) **Usage:** -[Training spaCy's statistical models](/usage/training) - - - -### Visualize a dependency parse and named entities in your browser {#lightning-tour-displacy model="parser, ner" new="2"} - -> #### Output -> -> ![displaCy visualization](../images/displacy-small.svg) - -```python -from spacy import displacy - -doc_dep = nlp("This is a sentence.") -displacy.serve(doc_dep, style="dep") - -doc_ent = nlp("When Sebastian Thrun started working on self-driving cars at Google " - "in 2007, few people outside of the company took him seriously.") -displacy.serve(doc_ent, style="ent") -``` - - - 
-**API:** [`displacy`](/api/top-level#displacy) **Usage:** -[Visualizers](/usage/visualizers) - - - -### Get word vectors and similarity {#lightning-tour-word-vectors model="vectors"} - -```python -### {executable="true"} -import spacy - -nlp = spacy.load("en_core_web_md") -doc = nlp("Apple and banana are similar. Pasta and hippo aren't.") - -apple = doc[0] -banana = doc[2] -pasta = doc[6] -hippo = doc[8] - -print("apple <-> banana", apple.similarity(banana)) -print("pasta <-> hippo", pasta.similarity(hippo)) -print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector) -``` - -For the best results, you should run this example using the -[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) model (currently -not available in the live demo). - - - -**Usage:** [Word vectors and similarity](/usage/vectors-embeddings) - - - -### Simple and efficient serialization {#lightning-tour-serialization} - -```python -import spacy -from spacy.tokens import Doc -from spacy.vocab import Vocab - -nlp = spacy.load("en_core_web_sm") -customer_feedback = open("customer_feedback_627.txt").read() -doc = nlp(customer_feedback) -doc.to_disk("/tmp/customer_feedback_627.bin") - -new_doc = Doc(Vocab()).from_disk("/tmp/customer_feedback_627.bin") -``` - - - -**API:** [`Language`](/api/language), [`Doc`](/api/doc) **Usage:** -[Saving and loading models](/usage/saving-loading#models) - - - -### Match text with token rules {#lightning-tour-rule-matcher} - -```python -### {executable="true"} -import spacy -from spacy.matcher import Matcher - -nlp = spacy.load("en_core_web_sm") -matcher = Matcher(nlp.vocab) - -def set_sentiment(matcher, doc, i, matches): - doc.sentiment += 0.1 - -pattern1 = [[{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]] -patterns = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]] -matcher.add("GoogleIO", patterns1) # Match "Google I/O" or "Google i/o" -matcher.add("HAPPY", patterns2, on_match=set_sentiment) # Match one 
or more happy emoji - -doc = nlp("A text about Google I/O 😀😀") -matches = matcher(doc) - -for match_id, start, end in matches: - string_id = nlp.vocab.strings[match_id] - span = doc[start:end] - print(string_id, span.text) -print("Sentiment", doc.sentiment) -``` - - - -**API:** [`Matcher`](/api/matcher) **Usage:** -[Rule-based matching](/usage/rule-based-matching) - - - -### Minibatched stream processing {#lightning-tour-minibatched} - -```python -texts = ["One document.", "...", "Lots of documents"] -# .pipe streams input, and produces streaming output -iter_texts = (texts[i % 3] for i in range(100000000)) -for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50)): - assert doc.is_parsed - if i == 100: - break -``` - -### Get syntactic dependencies {#lightning-tour-dependencies model="parser"} - -```python -### {executable="true"} -import spacy - -nlp = spacy.load("en_core_web_sm") -doc = nlp("When Sebastian Thrun started working on self-driving cars at Google " - "in 2007, few people outside of the company took him seriously.") - -dep_labels = [] -for token in doc: - while token.head != token: - dep_labels.append(token.dep_) - token = token.head -print(dep_labels) -``` - - - -**API:** [`Token`](/api/token) **Usage:** -[Using the dependency parse](/usage/linguistic-features#dependency-parse) - - - -### Export to numpy arrays {#lightning-tour-numpy-arrays} - -```python -### {executable="true"} -import spacy -from spacy.attrs import ORTH, LIKE_URL - -nlp = spacy.load("en_core_web_sm") -doc = nlp("Check out https://spacy.io") -for token in doc: - print(token.text, token.orth, token.like_url) - -attr_ids = [ORTH, LIKE_URL] -doc_array = doc.to_array(attr_ids) -print(doc_array.shape) -print(len(doc), len(attr_ids)) - -assert doc[0].orth == doc_array[0, 0] -assert doc[1].orth == doc_array[1, 0] -assert doc[0].like_url == doc_array[0, 1] - -assert list(doc_array[:, 1]) == [t.like_url for t in doc] -print(list(doc_array[:, 1])) -``` - -### Calculate inline markup on 
original string {#lightning-tour-inline} - -```python -### {executable="true"} -import spacy - -def put_spans_around_tokens(doc): - """Here, we're building a custom "syntax highlighter" for - part-of-speech tags and dependencies. We put each token in a - span element, with the appropriate classes computed. All whitespace is - preserved, outside of the spans. (Of course, HTML will only display - multiple whitespace if enabled – but the point is, no information is lost - and you can calculate what you need, e.g.
,

etc.) - """ - output = [] - for token in doc: - if token.is_space: - output.append(token.text) - else: - classes = f"pos-{token.pos_} dep-{token.dep_}" - output.append(f'{token.text}{token.whitespace_}') - string = "".join(output) - string = string.replace("\\n", "") - string = string.replace("\\t", " ") - return f"

{string}
" - - -nlp = spacy.load("en_core_web_sm") -doc = nlp("This is a test.\\n\\nHello world.") -html = put_spans_around_tokens(doc) -print(html) -``` - ## Architecture {#architecture} import Architecture101 from 'usage/101/\_architecture.md'