From 11347f34da5182d35559eae644231a432fb4d9c4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 4 Oct 2020 13:54:05 +0200 Subject: [PATCH] Tidy up, tests and docs --- spacy/pipeline/attributeruler.py | 57 ++++---- spacy/tests/pipeline/test_attributeruler.py | 105 ++++++++------ website/docs/api/attributeruler.md | 145 +++++++++++--------- website/docs/usage/linguistic-features.md | 12 +- website/docs/usage/v3.md | 26 +++- 5 files changed, 193 insertions(+), 152 deletions(-) diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index b4580ff7c..9e6174d07 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -1,10 +1,11 @@ +from typing import List, Dict, Union, Iterable, Any, Optional, Callable, Iterator +from typing import Tuple import srsly -from typing import List, Dict, Union, Iterable, Any, Optional from pathlib import Path from .pipe import Pipe from ..errors import Errors -from ..training import validate_examples +from ..training import validate_examples, Example from ..language import Language from ..matcher import Matcher from ..scorer import Scorer @@ -22,17 +23,9 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]] MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] -@Language.factory( - "attribute_ruler", default_config={"validate": False} -) -def make_attribute_ruler( - nlp: Language, - name: str, - validate: bool, -): - return AttributeRuler( - nlp.vocab, name, pattern_dicts=pattern_dicts, validate=validate - ) +@Language.factory("attribute_ruler", default_config={"validate": False}) +def make_attribute_ruler(nlp: Language, name: str, validate: bool): + return AttributeRuler(nlp.vocab, name, validate=validate) class AttributeRuler(Pipe): @@ -43,12 +36,7 @@ class AttributeRuler(Pipe): """ def __init__( - self, - vocab: Vocab, - name: str = "attribute_ruler", - *, - pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None, - validate: bool = 
False, + self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False ) -> None: """Create the AttributeRuler. After creation, you can add patterns with the `.initialize()` or `.add_patterns()` methods, or load patterns @@ -57,7 +45,7 @@ class AttributeRuler(Pipe): vocab (Vocab): The vocab. name (str): The pipe name. Defaults to "attribute_ruler". - + RETURNS (AttributeRuler): The AttributeRuler component. DOCS: https://nightly.spacy.io/api/attributeruler#init @@ -71,15 +59,15 @@ class AttributeRuler(Pipe): def initialize( self, - get_examples: Optional[Callable[[], Iterable[Example]]] = None, + get_examples: Optional[Callable[[], Iterable[Example]]], *, nlp: Optional[Language] = None, patterns: Optional[Iterable[AttributeRulerPatternType]] = None, - tag_map: Optional[TagMapType]=None, - morph_rules: Optional[MorphRulesType]=None + tag_map: Optional[TagMapType] = None, + morph_rules: Optional[MorphRulesType] = None, ): """Initialize the attribute ruler by adding zero or more patterns. - + Rules can be specified as a sequence of dicts using the `patterns` keyword argument. You can also provide rules using the "tag map" or "morph rules" formats supported by spaCy prior to v3. @@ -126,7 +114,7 @@ class AttributeRuler(Pipe): set_token_attrs(span[index], attrs) return doc - def pipe(self, stream, *, batch_size=128): + def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: """Apply the pipe to a stream of documents. This usually happens under the hood when the nlp object is called on a text and all components are applied to the Doc. @@ -210,16 +198,16 @@ class AttributeRuler(Pipe): self.attrs.append(attrs) self.indices.append(index) - def add_patterns(self, pattern_dicts: Iterable[AttributeRulerPatternType]) -> None: + def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None: """Add patterns from a list of pattern dicts with the keys as the arguments to AttributeRuler.add. 
- pattern_dicts (Iterable[dict]): A list of pattern dicts with the keys + patterns (Iterable[dict]): A list of pattern dicts with the keys as the arguments to AttributeRuler.add (patterns/attrs/index) to add as patterns. DOCS: https://nightly.spacy.io/api/attributeruler#add_patterns """ - for p in pattern_dicts: + for p in patterns: self.add(**p) @property @@ -234,7 +222,7 @@ class AttributeRuler(Pipe): all_patterns.append(p) return all_patterns - def score(self, examples, **kwargs): + def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: """Score a batch of examples. examples (Iterable[Example]): The examples to score. @@ -275,7 +263,7 @@ class AttributeRuler(Pipe): def from_bytes( self, bytes_data: bytes, exclude: Iterable[str] = SimpleFrozenList() - ): + ) -> "AttributeRuler": """Load the AttributeRuler from a bytestring. bytes_data (bytes): The data to load. @@ -293,7 +281,6 @@ class AttributeRuler(Pipe): "patterns": load_patterns, } util.from_bytes(bytes_data, deserialize, exclude) - return self def to_disk( @@ -303,6 +290,7 @@ class AttributeRuler(Pipe): path (Union[Path, str]): A path to a directory. exclude (Iterable[str]): String names of serialization fields to exclude. + DOCS: https://nightly.spacy.io/api/attributeruler#to_disk """ serialize = { @@ -313,11 +301,13 @@ class AttributeRuler(Pipe): def from_disk( self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList() - ) -> None: + ) -> "AttributeRuler": """Load the AttributeRuler from disk. path (Union[Path, str]): A path to a directory. exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (AttributeRuler): The loaded object. 
+ DOCS: https://nightly.spacy.io/api/attributeruler#from_disk """ @@ -329,11 +319,10 @@ class AttributeRuler(Pipe): "patterns": load_patterns, } util.from_disk(path, deserialize, exclude) - return self -def _split_morph_attrs(attrs): +def _split_morph_attrs(attrs: dict) -> Tuple[dict, dict]: """Split entries from a tag map or morph rules dict into to two dicts, one with the token-level features (POS, LEMMA) and one with the remaining features, which are presumed to be individual MORPH features.""" diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index 5773127af..c967bcdcd 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -63,6 +63,39 @@ def morph_rules(): return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}} +def check_tag_map(ruler): + doc = Doc( + ruler.vocab, + words=["This", "is", "a", "test", "."], + tags=["DT", "VBZ", "DT", "NN", "."], + ) + doc = ruler(doc) + for i in range(len(doc)): + if i == 4: + assert doc[i].pos_ == "PUNCT" + assert str(doc[i].morph) == "PunctType=peri" + else: + assert doc[i].pos_ == "" + assert str(doc[i].morph) == "" + + +def check_morph_rules(ruler): + doc = Doc( + ruler.vocab, + words=["This", "is", "the", "test", "."], + tags=["DT", "VBZ", "DT", "NN", "."], + ) + doc = ruler(doc) + for i in range(len(doc)): + if i != 2: + assert doc[i].pos_ == "" + assert str(doc[i].morph) == "" + else: + assert doc[2].pos_ == "DET" + assert doc[2].lemma_ == "a" + assert str(doc[2].morph) == "Case=Nom" + + def test_attributeruler_init(nlp, pattern_dicts): a = nlp.add_pipe("attribute_ruler") for p in pattern_dicts: @@ -78,7 +111,8 @@ def test_attributeruler_init(nlp, pattern_dicts): def test_attributeruler_init_patterns(nlp, pattern_dicts): # initialize with patterns - nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) + ruler = nlp.add_pipe("attribute_ruler") + ruler.initialize(lambda: [], 
patterns=pattern_dicts) doc = nlp("This is a test.") assert doc[2].lemma_ == "the" assert str(doc[2].morph) == "Case=Nom|Number=Plur" @@ -88,10 +122,11 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): assert doc.has_annotation("MORPH") nlp.remove_pipe("attribute_ruler") # initialize with patterns from asset - nlp.add_pipe( - "attribute_ruler", - config={"pattern_dicts": {"@misc": "attribute_ruler_patterns"}}, - ) + nlp.config["initialize"]["components"]["attribute_ruler"] = { + "patterns": {"@misc": "attribute_ruler_patterns"} + } + nlp.add_pipe("attribute_ruler") + nlp.initialize() doc = nlp("This is a test.") assert doc[2].lemma_ == "the" assert str(doc[2].morph) == "Case=Nom|Number=Plur" @@ -103,18 +138,15 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): def test_attributeruler_score(nlp, pattern_dicts): # initialize with patterns - nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) + ruler = nlp.add_pipe("attribute_ruler") + ruler.initialize(lambda: [], patterns=pattern_dicts) doc = nlp("This is a test.") assert doc[2].lemma_ == "the" assert str(doc[2].morph) == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" assert str(doc[3].morph) == "Case=Nom|Number=Sing" - - dev_examples = [ - Example.from_dict( - nlp.make_doc("This is a test."), {"lemmas": ["this", "is", "a", "cat", "."]} - ) - ] + doc = nlp.make_doc("This is a test.") + dev_examples = [Example.from_dict(doc, {"lemmas": ["this", "is", "a", "cat", "."]})] scores = nlp.evaluate(dev_examples) # "cat" is the only correct lemma assert scores["lemma_acc"] == pytest.approx(0.2) @@ -139,40 +171,27 @@ def test_attributeruler_rule_order(nlp): def test_attributeruler_tag_map(nlp, tag_map): - a = AttributeRuler(nlp.vocab) - a.load_from_tag_map(tag_map) - doc = Doc( - nlp.vocab, - words=["This", "is", "a", "test", "."], - tags=["DT", "VBZ", "DT", "NN", "."], - ) - doc = a(doc) - for i in range(len(doc)): - if i == 4: - assert doc[i].pos_ == "PUNCT" - assert 
str(doc[i].morph) == "PunctType=peri" - else: - assert doc[i].pos_ == "" - assert str(doc[i].morph) == "" + ruler = AttributeRuler(nlp.vocab) + ruler.load_from_tag_map(tag_map) + check_tag_map(ruler) + + +def test_attributeruler_tag_map_initialize(nlp, tag_map): + ruler = nlp.add_pipe("attribute_ruler") + ruler.initialize(lambda: [], tag_map=tag_map) + check_tag_map(ruler) def test_attributeruler_morph_rules(nlp, morph_rules): - a = AttributeRuler(nlp.vocab) - a.load_from_morph_rules(morph_rules) - doc = Doc( - nlp.vocab, - words=["This", "is", "the", "test", "."], - tags=["DT", "VBZ", "DT", "NN", "."], - ) - doc = a(doc) - for i in range(len(doc)): - if i != 2: - assert doc[i].pos_ == "" - assert str(doc[i].morph) == "" - else: - assert doc[2].pos_ == "DET" - assert doc[2].lemma_ == "a" - assert str(doc[2].morph) == "Case=Nom" + ruler = AttributeRuler(nlp.vocab) + ruler.load_from_morph_rules(morph_rules) + check_morph_rules(ruler) + + +def test_attributeruler_morph_rules_initialize(nlp, morph_rules): + ruler = nlp.add_pipe("attribute_ruler") + ruler.initialize(lambda: [], morph_rules=morph_rules) + check_morph_rules(ruler) def test_attributeruler_indices(nlp): diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md index 60fda6bda..b89759080 100644 --- a/website/docs/api/attributeruler.md +++ b/website/docs/api/attributeruler.md @@ -4,6 +4,7 @@ tag: class source: spacy/pipeline/attributeruler.py new: 3 teaser: 'Pipeline component for rule-based token attribute assignment' +api_base_class: /api/pipe api_string_name: attribute_ruler api_trainable: false --- @@ -25,17 +26,13 @@ how the component should be configured. 
You can override its settings via the > #### Example > > ```python -> config = { -> "pattern_dicts": None, -> "validate": True, -> } +> config = {"validate": True} > nlp.add_pipe("attribute_ruler", config=config) > ``` -| Setting | Description | -| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `pattern_dicts` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ | -| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ | +| Setting | Description | +| ---------- | --------------------------------------------------------------------------------------------- | +| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ | ```python %%GITHUB_SPACY/spacy/pipeline/attributeruler.py @@ -43,36 +40,26 @@ how the component should be configured. You can override its settings via the ## AttributeRuler.\_\_init\_\_ {#init tag="method"} -Initialize the attribute ruler. If pattern dicts are supplied here, they need to -be a list of dictionaries with `"patterns"`, `"attrs"`, and optional `"index"` -keys, e.g.: - -```python -pattern_dicts = [ - {"patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}}, - {"patterns": [[{"LOWER": "an"}]], "attrs": {"LEMMA": "a"}}, -] -``` +Initialize the attribute ruler. 
> #### Example > > ```python > # Construction via add_pipe -> attribute_ruler = nlp.add_pipe("attribute_ruler") +> ruler = nlp.add_pipe("attribute_ruler") > ``` -| Name | Description | -| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ | -| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ | -| _keyword-only_ | | -| `pattern_dicts` | Optional patterns to load in on initialization. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ | -| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ | +| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ | +| _keyword-only_ | | +| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ | ## AttributeRuler.\_\_call\_\_ {#call tag="method"} -Apply the attribute ruler to a `Doc`, setting token attributes for tokens matched -by the provided patterns. +Apply the attribute ruler to a `Doc`, setting token attributes for tokens +matched by the provided patterns. | Name | Description | | ----------- | -------------------------------- | @@ -90,10 +77,10 @@ may be negative to index from the end of the span. 
> #### Example > > ```python -> attribute_ruler = nlp.add_pipe("attribute_ruler") +> ruler = nlp.add_pipe("attribute_ruler") > patterns = [[{"TAG": "VB"}]] > attrs = {"POS": "VERB"} -> attribute_ruler.add(patterns=patterns, attrs=attrs) +> ruler.add(patterns=patterns, attrs=attrs) > ``` | Name | Description | @@ -107,11 +94,10 @@ may be negative to index from the end of the span. > #### Example > > ```python -> attribute_ruler = nlp.add_pipe("attribute_ruler") -> pattern_dicts = [ +> ruler = nlp.add_pipe("attribute_ruler") +> patterns = [ > { -> "patterns": [[{"TAG": "VB"}]], -> "attrs": {"POS": "VERB"} +> "patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"} > }, > { > "patterns": [[{"LOWER": "two"}, {"LOWER": "apples"}]], @@ -119,15 +105,16 @@ may be negative to index from the end of the span. > "index": -1 > }, > ] -> attribute_ruler.add_patterns(pattern_dicts) +> ruler.add_patterns(patterns) > ``` -Add patterns from a list of pattern dicts with the keys as the arguments to +Add patterns from a list of pattern dicts. Each pattern dict can specify the +keys `"patterns"`, `"attrs"` and `"index"`, which match the arguments of [`AttributeRuler.add`](/api/attributeruler#add). -| Name | Description | -| --------------- | -------------------------------------------------------------------------- | -| `pattern_dicts` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ | +| Name | Description | +| ---------- | -------------------------------------------------------------------------- | +| `patterns` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ | ## AttributeRuler.patterns {#patterns tag="property"} @@ -139,20 +126,39 @@ Get all patterns that have been added to the attribute ruler in the | ----------- | -------------------------------------------------------------------------------------------- | | **RETURNS** | The patterns added to the attribute ruler. 
~~List[Dict[str, Union[List[dict], dict, int]]]~~ | -## AttributeRuler.score {#score tag="method" new="3"} +## AttributeRuler.initialize {#initialize tag="method"} -Score a batch of examples. +Initialize the component with data. Typically called before training to load in +rules from a file. This method is typically called by +[`Language.initialize`](/api/language#initialize) and lets you customize +arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. > #### Example > > ```python -> scores = attribute_ruler.score(examples) +> ruler = nlp.add_pipe("attribute_ruler") +> ruler.initialize(lambda: [], nlp=nlp, patterns=patterns) +> ``` +> +> ```ini +> ### config.cfg +> [initialize.components.attribute_ruler] +> +> [initialize.components.attribute_ruler.patterns] +> @readers = "srsly.read_json.v1" +> path = "corpus/attribute_ruler_patterns.json" > ``` -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects (the training data). Not used by this component.
~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `patterns` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ | +| `tag_map` | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. Defaults to `None`. ~~Optional[Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ | +| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]]~~ | ## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"} @@ -170,6 +176,21 @@ Load attribute ruler patterns from morph rules. | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ | +## AttributeRuler.score {#score tag="method" new="3"} + +Score a batch of examples. + +> #### Example +> +> ```python +> scores = ruler.score(examples) +> ``` + +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. 
~~Iterable[Example]~~ | +| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ | + ## AttributeRuler.to_disk {#to_disk tag="method"} Serialize the pipe to disk. @@ -177,8 +198,8 @@ Serialize the pipe to disk. > #### Example > > ```python -> attribute_ruler = nlp.add_pipe("attribute_ruler") -> attribute_ruler.to_disk("/path/to/attribute_ruler") +> ruler = nlp.add_pipe("attribute_ruler") +> ruler.to_disk("/path/to/attribute_ruler") > ``` | Name | Description | @@ -194,8 +215,8 @@ Load the pipe from disk. Modifies the object in place and returns it. > #### Example > > ```python -> attribute_ruler = nlp.add_pipe("attribute_ruler") -> attribute_ruler.from_disk("/path/to/attribute_ruler") +> ruler = nlp.add_pipe("attribute_ruler") +> ruler.from_disk("/path/to/attribute_ruler") > ``` | Name | Description | @@ -210,8 +231,8 @@ Load the pipe from disk. Modifies the object in place and returns it. > #### Example > > ```python -> attribute_ruler = nlp.add_pipe("attribute_ruler") -> attribute_ruler_bytes = attribute_ruler.to_bytes() +> ruler = nlp.add_pipe("attribute_ruler") +> ruler_bytes = ruler.to_bytes() > ``` Serialize the pipe to a bytestring. @@ -229,9 +250,9 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > #### Example > > ```python -> attribute_ruler_bytes = attribute_ruler.to_bytes() -> attribute_ruler = nlp.add_pipe("attribute_ruler") -> attribute_ruler.from_bytes(attribute_ruler_bytes) +> ruler_bytes = ruler.to_bytes() +> ruler = nlp.add_pipe("attribute_ruler") +> ruler.from_bytes(ruler_bytes) > ``` | Name | Description | @@ -250,12 +271,12 @@ serialization by passing in the string names via the `exclude` argument.
> #### Example > > ```python -> data = attribute_ruler.to_disk("/path", exclude=["vocab"]) +> data = ruler.to_disk("/path", exclude=["vocab"]) > ``` -| Name | Description | -| ---------- | -------------------------------------------------------------- | -| `vocab` | The shared [`Vocab`](/api/vocab). | -| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. | -| `attrs` | The attributes to set. You usually don't want to exclude this. | -| `indices` | The token indices. You usually don't want to exclude this. | +| Name | Description | +| ---------- | --------------------------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. | +| `attrs` | The attributes to set. You usually don't want to exclude this. | +| `indices` | The token indices. You usually don't want to exclude this. | diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 7b9aaa0b9..1964bac18 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1801,17 +1801,7 @@ print(doc2[5].tag_, doc2[5].pos_) # WP PRON -For easy migration from from spaCy v2 to v3, the -[`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules** -in the v2 format with the methods -[`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and -[`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules). - -```diff -nlp = spacy.blank("en") -+ ruler = nlp.add_pipe("attribute_ruler") -+ ruler.load_from_tag_map(YOUR_TAG_MAP) -``` +The [`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules** in the v2.x format via its built-in methods or when the component is initialized before training. See the [migration guide](/usage/v3#migrating-training-mappings-exceptions) for details. 
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 4ce57af01..a10fc6321 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -804,8 +804,30 @@ nlp = spacy.blank("en") Instead of defining a `tag_map` and `morph_rules` in the language data, spaCy v3.0 now manages mappings and exceptions with a separate and more flexible pipeline component, the [`AttributeRuler`](/api/attributeruler). See the -[usage guide](/usage/linguistic-features#mappings-exceptions) for examples. The -`AttributeRuler` provides two handy helper methods +[usage guide](/usage/linguistic-features#mappings-exceptions) for examples. If +you have tag maps and morph rules in the v2.x format, you can load them into the +attribute ruler before training using the `[initialize]` block of your config. + +> #### What does the initialization do? +> +> The `[initialize]` block is used when +> [`nlp.initialize`](/api/language#initialize) is called (usually right before +> training). It lets you define data resources for initializing the pipeline in +> your `config.cfg`. After training, the rules are saved to disk with the +> exported pipeline, so your runtime model doesn't depend on local data. For +> details see the [config lifecycle](/usage/training/#config-lifecycle) and +> [initialization](/usage/training/#initialization) docs. + +```ini +### config.cfg (excerpt) +[initialize.components.attribute_ruler] + +[initialize.components.attribute_ruler.tag_map] +@readers = "srsly.read_json.v1" +path = "./corpus/tag_map.json" +``` + +The `AttributeRuler` also provides two handy helper methods [`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and [`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules) that let you load in your existing tag map or morph rules: