Merge pull request #6192 from explosion/feature/init-attr-ruler

This commit is contained in:
Ines Montani 2020-10-04 14:46:37 +02:00 committed by GitHub
commit 43d7652635
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 215 additions and 154 deletions

View File

@ -1,10 +1,11 @@
from typing import List, Dict, Union, Iterable, Any, Optional, Callable, Iterator
from typing import Tuple
import srsly import srsly
from typing import List, Dict, Union, Iterable, Any, Optional
from pathlib import Path from pathlib import Path
from .pipe import Pipe from .pipe import Pipe
from ..errors import Errors from ..errors import Errors
from ..training import validate_examples from ..training import validate_examples, Example
from ..language import Language from ..language import Language
from ..matcher import Matcher from ..matcher import Matcher
from ..scorer import Scorer from ..scorer import Scorer
@ -18,20 +19,13 @@ from .. import util
MatcherPatternType = List[Dict[Union[int, str], Any]] MatcherPatternType = List[Dict[Union[int, str], Any]]
AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]] AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]
TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
@Language.factory( @Language.factory("attribute_ruler", default_config={"validate": False})
"attribute_ruler", default_config={"pattern_dicts": None, "validate": False} def make_attribute_ruler(nlp: Language, name: str, validate: bool):
) return AttributeRuler(nlp.vocab, name, validate=validate)
def make_attribute_ruler(
nlp: Language,
name: str,
pattern_dicts: Optional[Iterable[AttributeRulerPatternType]],
validate: bool,
):
return AttributeRuler(
nlp.vocab, name, pattern_dicts=pattern_dicts, validate=validate
)
class AttributeRuler(Pipe): class AttributeRuler(Pipe):
@ -42,20 +36,15 @@ class AttributeRuler(Pipe):
""" """
def __init__( def __init__(
self, self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False
vocab: Vocab,
name: str = "attribute_ruler",
*,
pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None,
validate: bool = False,
) -> None: ) -> None:
"""Initialize the AttributeRuler. """Create the AttributeRuler. After creation, you can add patterns
with the `.initialize()` or `.add_patterns()` methods, or load patterns
with `.from_bytes()` or `.from_disk()`. Loading patterns will remove
any patterns you've added previously.
vocab (Vocab): The vocab. vocab (Vocab): The vocab.
name (str): The pipe name. Defaults to "attribute_ruler". name (str): The pipe name. Defaults to "attribute_ruler".
pattern_dicts (Iterable[Dict]): A list of pattern dicts with the keys as
the arguments to AttributeRuler.add (`patterns`/`attrs`/`index`) to add
as patterns.
RETURNS (AttributeRuler): The AttributeRuler component. RETURNS (AttributeRuler): The AttributeRuler component.
@ -68,8 +57,27 @@ class AttributeRuler(Pipe):
self._attrs_unnormed = [] # store for reference self._attrs_unnormed = [] # store for reference
self.indices = [] self.indices = []
if pattern_dicts: def initialize(
self.add_patterns(pattern_dicts) self,
get_examples: Optional[Callable[[], Iterable[Example]]],
*,
nlp: Optional[Language] = None,
patterns: Optional[Iterable[AttributeRulerPatternType]] = None,
tag_map: Optional[TagMapType] = None,
morph_rules: Optional[MorphRulesType] = None,
):
"""Initialize the attribute ruler by adding zero or more patterns.
Rules can be specified as a sequence of dicts using the `patterns`
keyword argument. You can also provide rules using the "tag map" or
"morph rules" formats supported by spaCy prior to v3.
"""
if patterns:
self.add_patterns(patterns)
if tag_map:
self.load_from_tag_map(tag_map)
if morph_rules:
self.load_from_morph_rules(morph_rules)
def __call__(self, doc: Doc) -> Doc: def __call__(self, doc: Doc) -> Doc:
"""Apply the AttributeRuler to a Doc and set all attribute exceptions. """Apply the AttributeRuler to a Doc and set all attribute exceptions.
@ -106,7 +114,7 @@ class AttributeRuler(Pipe):
set_token_attrs(span[index], attrs) set_token_attrs(span[index], attrs)
return doc return doc
def pipe(self, stream, *, batch_size=128): def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under """Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are the hood when the nlp object is called on a text and all components are
applied to the Doc. applied to the Doc.
@ -190,16 +198,16 @@ class AttributeRuler(Pipe):
self.attrs.append(attrs) self.attrs.append(attrs)
self.indices.append(index) self.indices.append(index)
def add_patterns(self, pattern_dicts: Iterable[AttributeRulerPatternType]) -> None: def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None:
"""Add patterns from a list of pattern dicts with the keys as the """Add patterns from a list of pattern dicts with the keys as the
arguments to AttributeRuler.add. arguments to AttributeRuler.add.
pattern_dicts (Iterable[dict]): A list of pattern dicts with the keys patterns (Iterable[dict]): A list of pattern dicts with the keys
as the arguments to AttributeRuler.add (patterns/attrs/index) to as the arguments to AttributeRuler.add (patterns/attrs/index) to
add as patterns. add as patterns.
DOCS: https://nightly.spacy.io/api/attributeruler#add_patterns DOCS: https://nightly.spacy.io/api/attributeruler#add_patterns
""" """
for p in pattern_dicts: for p in patterns:
self.add(**p) self.add(**p)
@property @property
@ -214,7 +222,7 @@ class AttributeRuler(Pipe):
all_patterns.append(p) all_patterns.append(p)
return all_patterns return all_patterns
def score(self, examples, **kwargs): def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples. """Score a batch of examples.
examples (Iterable[Example]): The examples to score. examples (Iterable[Example]): The examples to score.
@ -255,7 +263,7 @@ class AttributeRuler(Pipe):
def from_bytes( def from_bytes(
self, bytes_data: bytes, exclude: Iterable[str] = SimpleFrozenList() self, bytes_data: bytes, exclude: Iterable[str] = SimpleFrozenList()
): ) -> "AttributeRuler":
"""Load the AttributeRuler from a bytestring. """Load the AttributeRuler from a bytestring.
bytes_data (bytes): The data to load. bytes_data (bytes): The data to load.
@ -273,7 +281,6 @@ class AttributeRuler(Pipe):
"patterns": load_patterns, "patterns": load_patterns,
} }
util.from_bytes(bytes_data, deserialize, exclude) util.from_bytes(bytes_data, deserialize, exclude)
return self return self
def to_disk( def to_disk(
@ -283,6 +290,7 @@ class AttributeRuler(Pipe):
path (Union[Path, str]): A path to a directory. path (Union[Path, str]): A path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude. exclude (Iterable[str]): String names of serialization fields to exclude.
DOCS: https://nightly.spacy.io/api/attributeruler#to_disk DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
""" """
serialize = { serialize = {
@ -293,11 +301,13 @@ class AttributeRuler(Pipe):
def from_disk( def from_disk(
self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList() self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
) -> None: ) -> "AttributeRuler":
"""Load the AttributeRuler from disk. """Load the AttributeRuler from disk.
path (Union[Path, str]): A path to a directory. path (Union[Path, str]): A path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude. exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (AttributeRuler): The loaded object.
DOCS: https://nightly.spacy.io/api/attributeruler#from_disk DOCS: https://nightly.spacy.io/api/attributeruler#from_disk
""" """
@ -309,11 +319,10 @@ class AttributeRuler(Pipe):
"patterns": load_patterns, "patterns": load_patterns,
} }
util.from_disk(path, deserialize, exclude) util.from_disk(path, deserialize, exclude)
return self return self
def _split_morph_attrs(attrs): def _split_morph_attrs(attrs: dict) -> Tuple[dict, dict]:
"""Split entries from a tag map or morph rules dict into two dicts, one """Split entries from a tag map or morph rules dict into two dicts, one
with the token-level features (POS, LEMMA) and one with the remaining with the token-level features (POS, LEMMA) and one with the remaining
features, which are presumed to be individual MORPH features.""" features, which are presumed to be individual MORPH features."""

View File

@ -63,6 +63,39 @@ def morph_rules():
return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}} return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}}
def check_tag_map(ruler):
doc = Doc(
ruler.vocab,
words=["This", "is", "a", "test", "."],
tags=["DT", "VBZ", "DT", "NN", "."],
)
doc = ruler(doc)
for i in range(len(doc)):
if i == 4:
assert doc[i].pos_ == "PUNCT"
assert str(doc[i].morph) == "PunctType=peri"
else:
assert doc[i].pos_ == ""
assert str(doc[i].morph) == ""
def check_morph_rules(ruler):
doc = Doc(
ruler.vocab,
words=["This", "is", "the", "test", "."],
tags=["DT", "VBZ", "DT", "NN", "."],
)
doc = ruler(doc)
for i in range(len(doc)):
if i != 2:
assert doc[i].pos_ == ""
assert str(doc[i].morph) == ""
else:
assert doc[2].pos_ == "DET"
assert doc[2].lemma_ == "a"
assert str(doc[2].morph) == "Case=Nom"
def test_attributeruler_init(nlp, pattern_dicts): def test_attributeruler_init(nlp, pattern_dicts):
a = nlp.add_pipe("attribute_ruler") a = nlp.add_pipe("attribute_ruler")
for p in pattern_dicts: for p in pattern_dicts:
@ -78,7 +111,8 @@ def test_attributeruler_init(nlp, pattern_dicts):
def test_attributeruler_init_patterns(nlp, pattern_dicts): def test_attributeruler_init_patterns(nlp, pattern_dicts):
# initialize with patterns # initialize with patterns
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) ruler = nlp.add_pipe("attribute_ruler")
ruler.initialize(lambda: [], patterns=pattern_dicts)
doc = nlp("This is a test.") doc = nlp("This is a test.")
assert doc[2].lemma_ == "the" assert doc[2].lemma_ == "the"
assert str(doc[2].morph) == "Case=Nom|Number=Plur" assert str(doc[2].morph) == "Case=Nom|Number=Plur"
@ -88,10 +122,11 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
assert doc.has_annotation("MORPH") assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler") nlp.remove_pipe("attribute_ruler")
# initialize with patterns from asset # initialize with patterns from asset
nlp.add_pipe( nlp.config["initialize"]["components"]["attribute_ruler"] = {
"attribute_ruler", "patterns": {"@misc": "attribute_ruler_patterns"}
config={"pattern_dicts": {"@misc": "attribute_ruler_patterns"}}, }
) nlp.add_pipe("attribute_ruler")
nlp.initialize()
doc = nlp("This is a test.") doc = nlp("This is a test.")
assert doc[2].lemma_ == "the" assert doc[2].lemma_ == "the"
assert str(doc[2].morph) == "Case=Nom|Number=Plur" assert str(doc[2].morph) == "Case=Nom|Number=Plur"
@ -103,18 +138,15 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
def test_attributeruler_score(nlp, pattern_dicts): def test_attributeruler_score(nlp, pattern_dicts):
# initialize with patterns # initialize with patterns
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) ruler = nlp.add_pipe("attribute_ruler")
ruler.initialize(lambda: [], patterns=pattern_dicts)
doc = nlp("This is a test.") doc = nlp("This is a test.")
assert doc[2].lemma_ == "the" assert doc[2].lemma_ == "the"
assert str(doc[2].morph) == "Case=Nom|Number=Plur" assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat" assert doc[3].lemma_ == "cat"
assert str(doc[3].morph) == "Case=Nom|Number=Sing" assert str(doc[3].morph) == "Case=Nom|Number=Sing"
doc = nlp.make_doc("This is a test.")
dev_examples = [ dev_examples = [Example.from_dict(doc, {"lemmas": ["this", "is", "a", "cat", "."]})]
Example.from_dict(
nlp.make_doc("This is a test."), {"lemmas": ["this", "is", "a", "cat", "."]}
)
]
scores = nlp.evaluate(dev_examples) scores = nlp.evaluate(dev_examples)
# "cat" is the only correct lemma # "cat" is the only correct lemma
assert scores["lemma_acc"] == pytest.approx(0.2) assert scores["lemma_acc"] == pytest.approx(0.2)
@ -139,40 +171,27 @@ def test_attributeruler_rule_order(nlp):
def test_attributeruler_tag_map(nlp, tag_map): def test_attributeruler_tag_map(nlp, tag_map):
a = AttributeRuler(nlp.vocab) ruler = AttributeRuler(nlp.vocab)
a.load_from_tag_map(tag_map) ruler.load_from_tag_map(tag_map)
doc = Doc( check_tag_map(ruler)
nlp.vocab,
words=["This", "is", "a", "test", "."],
tags=["DT", "VBZ", "DT", "NN", "."], def test_attributeruler_tag_map_initialize(nlp, tag_map):
) ruler = nlp.add_pipe("attribute_ruler")
doc = a(doc) ruler.initialize(lambda: [], tag_map=tag_map)
for i in range(len(doc)): check_tag_map(ruler)
if i == 4:
assert doc[i].pos_ == "PUNCT"
assert str(doc[i].morph) == "PunctType=peri"
else:
assert doc[i].pos_ == ""
assert str(doc[i].morph) == ""
def test_attributeruler_morph_rules(nlp, morph_rules): def test_attributeruler_morph_rules(nlp, morph_rules):
a = AttributeRuler(nlp.vocab) ruler = AttributeRuler(nlp.vocab)
a.load_from_morph_rules(morph_rules) ruler.load_from_morph_rules(morph_rules)
doc = Doc( check_morph_rules(ruler)
nlp.vocab,
words=["This", "is", "the", "test", "."],
tags=["DT", "VBZ", "DT", "NN", "."], def test_attributeruler_morph_rules_initialize(nlp, morph_rules):
) ruler = nlp.add_pipe("attribute_ruler")
doc = a(doc) ruler.initialize(lambda: [], morph_rules=morph_rules)
for i in range(len(doc)): check_morph_rules(ruler)
if i != 2:
assert doc[i].pos_ == ""
assert str(doc[i].morph) == ""
else:
assert doc[2].pos_ == "DET"
assert doc[2].lemma_ == "a"
assert str(doc[2].morph) == "Case=Nom"
def test_attributeruler_indices(nlp): def test_attributeruler_indices(nlp):

View File

@ -4,6 +4,7 @@ tag: class
source: spacy/pipeline/attributeruler.py source: spacy/pipeline/attributeruler.py
new: 3 new: 3
teaser: 'Pipeline component for rule-based token attribute assignment' teaser: 'Pipeline component for rule-based token attribute assignment'
api_base_class: /api/pipe
api_string_name: attribute_ruler api_string_name: attribute_ruler
api_trainable: false api_trainable: false
--- ---
@ -25,17 +26,13 @@ how the component should be configured. You can override its settings via the
> #### Example > #### Example
> >
> ```python > ```python
> config = { > config = {"validate": True}
> "pattern_dicts": None,
> "validate": True,
> }
> nlp.add_pipe("attribute_ruler", config=config) > nlp.add_pipe("attribute_ruler", config=config)
> ``` > ```
| Setting | Description | | Setting | Description |
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------- | --------------------------------------------------------------------------------------------- |
| `pattern_dicts` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ | | `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
```python ```python
%%GITHUB_SPACY/spacy/pipeline/attributeruler.py %%GITHUB_SPACY/spacy/pipeline/attributeruler.py
@ -43,36 +40,26 @@ how the component should be configured. You can override its settings via the
## AttributeRuler.\_\_init\_\_ {#init tag="method"} ## AttributeRuler.\_\_init\_\_ {#init tag="method"}
Initialize the attribute ruler. If pattern dicts are supplied here, they need to Initialize the attribute ruler.
be a list of dictionaries with `"patterns"`, `"attrs"`, and optional `"index"`
keys, e.g.:
```python
pattern_dicts = [
{"patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}},
{"patterns": [[{"LOWER": "an"}]], "attrs": {"LEMMA": "a"}},
]
```
> #### Example > #### Example
> >
> ```python > ```python
> # Construction via add_pipe > # Construction via add_pipe
> attribute_ruler = nlp.add_pipe("attribute_ruler") > ruler = nlp.add_pipe("attribute_ruler")
> ``` > ```
| Name | Description | | Name | Description |
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ | | `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ |
| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ | | `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `pattern_dicts` | Optional patterns to load in on initialization. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ | | `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ |
| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ |
## AttributeRuler.\_\_call\_\_ {#call tag="method"} ## AttributeRuler.\_\_call\_\_ {#call tag="method"}
Apply the attribute ruler to a `Doc`, setting token attributes for tokens matched Apply the attribute ruler to a `Doc`, setting token attributes for tokens
by the provided patterns. matched by the provided patterns.
| Name | Description | | Name | Description |
| ----------- | -------------------------------- | | ----------- | -------------------------------- |
@ -90,10 +77,10 @@ may be negative to index from the end of the span.
> #### Example > #### Example
> >
> ```python > ```python
> attribute_ruler = nlp.add_pipe("attribute_ruler") > ruler = nlp.add_pipe("attribute_ruler")
> patterns = [[{"TAG": "VB"}]] > patterns = [[{"TAG": "VB"}]]
> attrs = {"POS": "VERB"} > attrs = {"POS": "VERB"}
> attribute_ruler.add(patterns=patterns, attrs=attrs) > ruler.add(patterns=patterns, attrs=attrs)
> ``` > ```
| Name | Description | | Name | Description |
@ -107,11 +94,10 @@ may be negative to index from the end of the span.
> #### Example > #### Example
> >
> ```python > ```python
> attribute_ruler = nlp.add_pipe("attribute_ruler") > ruler = nlp.add_pipe("attribute_ruler")
> pattern_dicts = [ > patterns = [
> { > {
> "patterns": [[{"TAG": "VB"}]], > "patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}
> "attrs": {"POS": "VERB"}
> }, > },
> { > {
> "patterns": [[{"LOWER": "two"}, {"LOWER": "apples"}]], > "patterns": [[{"LOWER": "two"}, {"LOWER": "apples"}]],
@ -119,15 +105,16 @@ may be negative to index from the end of the span.
> "index": -1 > "index": -1
> }, > },
> ] > ]
> attribute_ruler.add_patterns(pattern_dicts) > ruler.add_patterns(patterns)
> ``` > ```
Add patterns from a list of pattern dicts with the keys as the arguments to Add patterns from a list of pattern dicts. Each pattern dict can specify the
keys `"patterns"`, `"attrs"` and `"index"`, which match the arguments of
[`AttributeRuler.add`](/api/attributeruler#add). [`AttributeRuler.add`](/api/attributeruler#add).
| Name | Description | | Name | Description |
| --------------- | -------------------------------------------------------------------------- | | ---------- | -------------------------------------------------------------------------- |
| `pattern_dicts` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ | | `patterns` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ |
## AttributeRuler.patterns {#patterns tag="property"} ## AttributeRuler.patterns {#patterns tag="property"}
@ -139,20 +126,39 @@ Get all patterns that have been added to the attribute ruler in the
| ----------- | -------------------------------------------------------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------------- |
| **RETURNS** | The patterns added to the attribute ruler. ~~List[Dict[str, Union[List[dict], dict, int]]]~~ | | **RETURNS** | The patterns added to the attribute ruler. ~~List[Dict[str, Union[List[dict], dict, int]]]~~ |
## AttributeRuler.score {#score tag="method" new="3"} ## AttributeRuler.initialize {#initialize tag="method"}
Score a batch of examples. Initialize the component with data. Typically called before training to load in
rules from a file. This method is usually called by
[`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
> #### Example > #### Example
> >
> ```python > ```python
> scores = attribute_ruler.score(examples) > ruler = nlp.add_pipe("attribute_ruler")
> ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
> ```
>
> ```ini
> ### config.cfg
> [initialize.components.attribute_ruler]
>
> [initialize.components.attribute_ruler.patterns]
> @readers = "srsly.read_json.v1"
> path = "corpus/attribute_ruler_patterns.json"
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects (the training data). Not used by this component. ~~Callable[[], Iterable[Example]]~~ |
| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ | | _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `patterns` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ |
| `tag_map` | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. Defaults to `None`. ~~Optional[Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ |
| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]]~~ |
## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"} ## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"}
@ -170,6 +176,21 @@ Load attribute ruler patterns from morph rules.
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ | | `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ |
## AttributeRuler.score {#score tag="method" new="3"}
Score a batch of examples.
> #### Example
>
> ```python
> scores = ruler.score(examples)
> ```
| Name | Description |
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ |
## AttributeRuler.to_disk {#to_disk tag="method"} ## AttributeRuler.to_disk {#to_disk tag="method"}
Serialize the pipe to disk. Serialize the pipe to disk.
@ -177,8 +198,8 @@ Serialize the pipe to disk.
> #### Example > #### Example
> >
> ```python > ```python
> attribute_ruler = nlp.add_pipe("attribute_ruler") > ruler = nlp.add_pipe("attribute_ruler")
> attribute_ruler.to_disk("/path/to/attribute_ruler") > ruler.to_disk("/path/to/attribute_ruler")
> ``` > ```
| Name | Description | | Name | Description |
@ -194,8 +215,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
> #### Example > #### Example
> >
> ```python > ```python
> attribute_ruler = nlp.add_pipe("attribute_ruler") > ruler = nlp.add_pipe("attribute_ruler")
> attribute_ruler.from_disk("/path/to/attribute_ruler") > ruler.from_disk("/path/to/attribute_ruler")
> ``` > ```
| Name | Description | | Name | Description |
@ -210,8 +231,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
> #### Example > #### Example
> >
> ```python > ```python
> attribute_ruler = nlp.add_pipe("attribute_ruler") > ruler = nlp.add_pipe("attribute_ruler")
> attribute_ruler_bytes = attribute_ruler.to_bytes() > ruler_bytes = ruler.to_bytes()
> ``` > ```
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
@ -229,9 +250,9 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> #### Example > #### Example
> >
> ```python > ```python
> attribute_ruler_bytes = attribute_ruler.to_bytes() > ruler_bytes = ruler.to_bytes()
> attribute_ruler = nlp.add_pipe("attribute_ruler") > ruler = nlp.add_pipe("attribute_ruler")
> attribute_ruler.from_bytes(attribute_ruler_bytes) > ruler.from_bytes(ruler_bytes)
> ``` > ```
| Name | Description | | Name | Description |
@ -250,12 +271,12 @@ serialization by passing in the string names via the `exclude` argument.
> #### Example > #### Example
> >
> ```python > ```python
> data = attribute_ruler.to_disk("/path", exclude=["vocab"]) > data = ruler.to_disk("/path", exclude=["vocab"])
> ``` > ```
| Name | Description | | Name | Description |
| ---------- | -------------------------------------------------------------- | | ---------- | --------------------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). | | `vocab` | The shared [`Vocab`](/api/vocab). |
| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. | | `patterns` | The `Matcher` patterns. You usually don't want to exclude this. |
| `attrs` | The attributes to set. You usually don't want to exclude this. | | `attrs` | The attributes to set. You usually don't want to exclude this. |
| `indices` | The token indices. You usually don't want to exclude this. | | `indices` | The token indices. You usually don't want to exclude this. |

View File

@ -1801,17 +1801,7 @@ print(doc2[5].tag_, doc2[5].pos_) # WP PRON
<Infobox variant="warning" title="Migrating from spaCy v2.x"> <Infobox variant="warning" title="Migrating from spaCy v2.x">
For easy migration from from spaCy v2 to v3, the The [`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules** in the v2.x format via its built-in methods or when the component is initialized before training. See the [migration guide](/usage/v3#migrating-training-mappings-exceptions) for details.
[`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules**
in the v2 format with the methods
[`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and
[`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules).
```diff
nlp = spacy.blank("en")
+ ruler = nlp.add_pipe("attribute_ruler")
+ ruler.load_from_tag_map(YOUR_TAG_MAP)
```
</Infobox> </Infobox>

View File

@ -804,8 +804,30 @@ nlp = spacy.blank("en")
Instead of defining a `tag_map` and `morph_rules` in the language data, spaCy Instead of defining a `tag_map` and `morph_rules` in the language data, spaCy
v3.0 now manages mappings and exceptions with a separate and more flexible v3.0 now manages mappings and exceptions with a separate and more flexible
pipeline component, the [`AttributeRuler`](/api/attributeruler). See the pipeline component, the [`AttributeRuler`](/api/attributeruler). See the
[usage guide](/usage/linguistic-features#mappings-exceptions) for examples. The [usage guide](/usage/linguistic-features#mappings-exceptions) for examples. If
`AttributeRuler` provides two handy helper methods you have tag maps and morph rules in the v2.x format, you can load them into the
attribute ruler before training using the `[initialize]` block of your config.
> #### What does the initialization do?
>
> The `[initialize]` block is used when
> [`nlp.initialize`](/api/language#initialize) is called (usually right before
> training). It lets you define data resources for initializing the pipeline in
> your `config.cfg`. After training, the rules are saved to disk with the
> exported pipeline, so your runtime model doesn't depend on local data. For
> details see the [config lifecycle](/usage/training/#config-lifecycle) and
> [initialization](/usage/training/#initialization) docs.
```ini
### config.cfg (excerpt)
[initialize.components.attribute_ruler]
[initialize.components.attribute_ruler.tag_map]
@readers = "srsly.read_json.v1"
path = "./corpus/tag_map.json"
```
The `AttributeRuler` also provides two handy helper methods
[`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and [`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and
[`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules) that let [`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules) that let
you load in your existing tag map or morph rules: you load in your existing tag map or morph rules: