mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge pull request #6192 from explosion/feature/init-attr-ruler
This commit is contained in:
commit
43d7652635
|
@ -1,10 +1,11 @@
|
|||
from typing import List, Dict, Union, Iterable, Any, Optional, Callable, Iterator
|
||||
from typing import Tuple
|
||||
import srsly
|
||||
from typing import List, Dict, Union, Iterable, Any, Optional
|
||||
from pathlib import Path
|
||||
|
||||
from .pipe import Pipe
|
||||
from ..errors import Errors
|
||||
from ..training import validate_examples
|
||||
from ..training import validate_examples, Example
|
||||
from ..language import Language
|
||||
from ..matcher import Matcher
|
||||
from ..scorer import Scorer
|
||||
|
@ -18,20 +19,13 @@ from .. import util
|
|||
|
||||
MatcherPatternType = List[Dict[Union[int, str], Any]]
|
||||
AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]
|
||||
TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
|
||||
MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"attribute_ruler", default_config={"pattern_dicts": None, "validate": False}
|
||||
)
|
||||
def make_attribute_ruler(
|
||||
nlp: Language,
|
||||
name: str,
|
||||
pattern_dicts: Optional[Iterable[AttributeRulerPatternType]],
|
||||
validate: bool,
|
||||
):
|
||||
return AttributeRuler(
|
||||
nlp.vocab, name, pattern_dicts=pattern_dicts, validate=validate
|
||||
)
|
||||
@Language.factory("attribute_ruler", default_config={"validate": False})
|
||||
def make_attribute_ruler(nlp: Language, name: str, validate: bool):
|
||||
return AttributeRuler(nlp.vocab, name, validate=validate)
|
||||
|
||||
|
||||
class AttributeRuler(Pipe):
|
||||
|
@ -42,20 +36,15 @@ class AttributeRuler(Pipe):
|
|||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Vocab,
|
||||
name: str = "attribute_ruler",
|
||||
*,
|
||||
pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None,
|
||||
validate: bool = False,
|
||||
self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False
|
||||
) -> None:
|
||||
"""Initialize the AttributeRuler.
|
||||
"""Create the AttributeRuler. After creation, you can add patterns
|
||||
with the `.initialize()` or `.add_patterns()` methods, or load patterns
|
||||
with `.from_bytes()` or `.from_disk()`. Loading patterns will remove
|
||||
any patterns you've added previously.
|
||||
|
||||
vocab (Vocab): The vocab.
|
||||
name (str): The pipe name. Defaults to "attribute_ruler".
|
||||
pattern_dicts (Iterable[Dict]): A list of pattern dicts with the keys as
|
||||
the arguments to AttributeRuler.add (`patterns`/`attrs`/`index`) to add
|
||||
as patterns.
|
||||
|
||||
RETURNS (AttributeRuler): The AttributeRuler component.
|
||||
|
||||
|
@ -68,8 +57,27 @@ class AttributeRuler(Pipe):
|
|||
self._attrs_unnormed = [] # store for reference
|
||||
self.indices = []
|
||||
|
||||
if pattern_dicts:
|
||||
self.add_patterns(pattern_dicts)
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Optional[Callable[[], Iterable[Example]]],
|
||||
*,
|
||||
nlp: Optional[Language] = None,
|
||||
patterns: Optional[Iterable[AttributeRulerPatternType]] = None,
|
||||
tag_map: Optional[TagMapType] = None,
|
||||
morph_rules: Optional[MorphRulesType] = None,
|
||||
):
|
||||
"""Initialize the attribute ruler by adding zero or more patterns.
|
||||
|
||||
Rules can be specified as a sequence of dicts using the `patterns`
|
||||
keyword argument. You can also provide rules using the "tag map" or
|
||||
"morph rules" formats supported by spaCy prior to v3.
|
||||
"""
|
||||
if patterns:
|
||||
self.add_patterns(patterns)
|
||||
if tag_map:
|
||||
self.load_from_tag_map(tag_map)
|
||||
if morph_rules:
|
||||
self.load_from_morph_rules(morph_rules)
|
||||
|
||||
def __call__(self, doc: Doc) -> Doc:
|
||||
"""Apply the AttributeRuler to a Doc and set all attribute exceptions.
|
||||
|
@ -106,7 +114,7 @@ class AttributeRuler(Pipe):
|
|||
set_token_attrs(span[index], attrs)
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, *, batch_size=128):
|
||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||
"""Apply the pipe to a stream of documents. This usually happens under
|
||||
the hood when the nlp object is called on a text and all components are
|
||||
applied to the Doc.
|
||||
|
@ -190,16 +198,16 @@ class AttributeRuler(Pipe):
|
|||
self.attrs.append(attrs)
|
||||
self.indices.append(index)
|
||||
|
||||
def add_patterns(self, pattern_dicts: Iterable[AttributeRulerPatternType]) -> None:
|
||||
def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None:
|
||||
"""Add patterns from a list of pattern dicts with the keys as the
|
||||
arguments to AttributeRuler.add.
|
||||
pattern_dicts (Iterable[dict]): A list of pattern dicts with the keys
|
||||
patterns (Iterable[dict]): A list of pattern dicts with the keys
|
||||
as the arguments to AttributeRuler.add (patterns/attrs/index) to
|
||||
add as patterns.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler#add_patterns
|
||||
"""
|
||||
for p in pattern_dicts:
|
||||
for p in patterns:
|
||||
self.add(**p)
|
||||
|
||||
@property
|
||||
|
@ -214,7 +222,7 @@ class AttributeRuler(Pipe):
|
|||
all_patterns.append(p)
|
||||
return all_patterns
|
||||
|
||||
def score(self, examples, **kwargs):
|
||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
|
@ -255,7 +263,7 @@ class AttributeRuler(Pipe):
|
|||
|
||||
def from_bytes(
|
||||
self, bytes_data: bytes, exclude: Iterable[str] = SimpleFrozenList()
|
||||
):
|
||||
) -> "AttributeRuler":
|
||||
"""Load the AttributeRuler from a bytestring.
|
||||
|
||||
bytes_data (bytes): The data to load.
|
||||
|
@ -273,7 +281,6 @@ class AttributeRuler(Pipe):
|
|||
"patterns": load_patterns,
|
||||
}
|
||||
util.from_bytes(bytes_data, deserialize, exclude)
|
||||
|
||||
return self
|
||||
|
||||
def to_disk(
|
||||
|
@ -283,6 +290,7 @@ class AttributeRuler(Pipe):
|
|||
|
||||
path (Union[Path, str]): A path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
|
||||
"""
|
||||
serialize = {
|
||||
|
@ -293,11 +301,13 @@ class AttributeRuler(Pipe):
|
|||
|
||||
def from_disk(
|
||||
self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
) -> "AttributeRuler":
|
||||
"""Load the AttributeRuler from disk.
|
||||
|
||||
path (Union[Path, str]): A path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (AttributeRuler): The loaded object.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler#from_disk
|
||||
"""
|
||||
|
||||
|
@ -309,11 +319,10 @@ class AttributeRuler(Pipe):
|
|||
"patterns": load_patterns,
|
||||
}
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
|
||||
return self
|
||||
|
||||
|
||||
def _split_morph_attrs(attrs):
|
||||
def _split_morph_attrs(attrs: dict) -> Tuple[dict, dict]:
|
||||
"""Split entries from a tag map or morph rules dict into two dicts, one
|
||||
with the token-level features (POS, LEMMA) and one with the remaining
|
||||
features, which are presumed to be individual MORPH features."""
|
||||
|
|
|
@ -63,6 +63,39 @@ def morph_rules():
|
|||
return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}}
|
||||
|
||||
|
||||
def check_tag_map(ruler):
|
||||
doc = Doc(
|
||||
ruler.vocab,
|
||||
words=["This", "is", "a", "test", "."],
|
||||
tags=["DT", "VBZ", "DT", "NN", "."],
|
||||
)
|
||||
doc = ruler(doc)
|
||||
for i in range(len(doc)):
|
||||
if i == 4:
|
||||
assert doc[i].pos_ == "PUNCT"
|
||||
assert str(doc[i].morph) == "PunctType=peri"
|
||||
else:
|
||||
assert doc[i].pos_ == ""
|
||||
assert str(doc[i].morph) == ""
|
||||
|
||||
|
||||
def check_morph_rules(ruler):
|
||||
doc = Doc(
|
||||
ruler.vocab,
|
||||
words=["This", "is", "the", "test", "."],
|
||||
tags=["DT", "VBZ", "DT", "NN", "."],
|
||||
)
|
||||
doc = ruler(doc)
|
||||
for i in range(len(doc)):
|
||||
if i != 2:
|
||||
assert doc[i].pos_ == ""
|
||||
assert str(doc[i].morph) == ""
|
||||
else:
|
||||
assert doc[2].pos_ == "DET"
|
||||
assert doc[2].lemma_ == "a"
|
||||
assert str(doc[2].morph) == "Case=Nom"
|
||||
|
||||
|
||||
def test_attributeruler_init(nlp, pattern_dicts):
|
||||
a = nlp.add_pipe("attribute_ruler")
|
||||
for p in pattern_dicts:
|
||||
|
@ -78,7 +111,8 @@ def test_attributeruler_init(nlp, pattern_dicts):
|
|||
|
||||
def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
||||
# initialize with patterns
|
||||
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
||||
ruler = nlp.add_pipe("attribute_ruler")
|
||||
ruler.initialize(lambda: [], patterns=pattern_dicts)
|
||||
doc = nlp("This is a test.")
|
||||
assert doc[2].lemma_ == "the"
|
||||
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||
|
@ -88,10 +122,11 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
|||
assert doc.has_annotation("MORPH")
|
||||
nlp.remove_pipe("attribute_ruler")
|
||||
# initialize with patterns from asset
|
||||
nlp.add_pipe(
|
||||
"attribute_ruler",
|
||||
config={"pattern_dicts": {"@misc": "attribute_ruler_patterns"}},
|
||||
)
|
||||
nlp.config["initialize"]["components"]["attribute_ruler"] = {
|
||||
"patterns": {"@misc": "attribute_ruler_patterns"}
|
||||
}
|
||||
nlp.add_pipe("attribute_ruler")
|
||||
nlp.initialize()
|
||||
doc = nlp("This is a test.")
|
||||
assert doc[2].lemma_ == "the"
|
||||
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||
|
@ -103,18 +138,15 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
|||
|
||||
def test_attributeruler_score(nlp, pattern_dicts):
|
||||
# initialize with patterns
|
||||
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
||||
ruler = nlp.add_pipe("attribute_ruler")
|
||||
ruler.initialize(lambda: [], patterns=pattern_dicts)
|
||||
doc = nlp("This is a test.")
|
||||
assert doc[2].lemma_ == "the"
|
||||
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||
assert doc[3].lemma_ == "cat"
|
||||
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
||||
|
||||
dev_examples = [
|
||||
Example.from_dict(
|
||||
nlp.make_doc("This is a test."), {"lemmas": ["this", "is", "a", "cat", "."]}
|
||||
)
|
||||
]
|
||||
doc = nlp.make_doc("This is a test.")
|
||||
dev_examples = [Example.from_dict(doc, {"lemmas": ["this", "is", "a", "cat", "."]})]
|
||||
scores = nlp.evaluate(dev_examples)
|
||||
# "cat" is the only correct lemma
|
||||
assert scores["lemma_acc"] == pytest.approx(0.2)
|
||||
|
@ -139,40 +171,27 @@ def test_attributeruler_rule_order(nlp):
|
|||
|
||||
|
||||
def test_attributeruler_tag_map(nlp, tag_map):
|
||||
a = AttributeRuler(nlp.vocab)
|
||||
a.load_from_tag_map(tag_map)
|
||||
doc = Doc(
|
||||
nlp.vocab,
|
||||
words=["This", "is", "a", "test", "."],
|
||||
tags=["DT", "VBZ", "DT", "NN", "."],
|
||||
)
|
||||
doc = a(doc)
|
||||
for i in range(len(doc)):
|
||||
if i == 4:
|
||||
assert doc[i].pos_ == "PUNCT"
|
||||
assert str(doc[i].morph) == "PunctType=peri"
|
||||
else:
|
||||
assert doc[i].pos_ == ""
|
||||
assert str(doc[i].morph) == ""
|
||||
ruler = AttributeRuler(nlp.vocab)
|
||||
ruler.load_from_tag_map(tag_map)
|
||||
check_tag_map(ruler)
|
||||
|
||||
|
||||
def test_attributeruler_tag_map_initialize(nlp, tag_map):
|
||||
ruler = nlp.add_pipe("attribute_ruler")
|
||||
ruler.initialize(lambda: [], tag_map=tag_map)
|
||||
check_tag_map(ruler)
|
||||
|
||||
|
||||
def test_attributeruler_morph_rules(nlp, morph_rules):
|
||||
a = AttributeRuler(nlp.vocab)
|
||||
a.load_from_morph_rules(morph_rules)
|
||||
doc = Doc(
|
||||
nlp.vocab,
|
||||
words=["This", "is", "the", "test", "."],
|
||||
tags=["DT", "VBZ", "DT", "NN", "."],
|
||||
)
|
||||
doc = a(doc)
|
||||
for i in range(len(doc)):
|
||||
if i != 2:
|
||||
assert doc[i].pos_ == ""
|
||||
assert str(doc[i].morph) == ""
|
||||
else:
|
||||
assert doc[2].pos_ == "DET"
|
||||
assert doc[2].lemma_ == "a"
|
||||
assert str(doc[2].morph) == "Case=Nom"
|
||||
ruler = AttributeRuler(nlp.vocab)
|
||||
ruler.load_from_morph_rules(morph_rules)
|
||||
check_morph_rules(ruler)
|
||||
|
||||
|
||||
def test_attributeruler_morph_rules_initialize(nlp, morph_rules):
|
||||
ruler = nlp.add_pipe("attribute_ruler")
|
||||
ruler.initialize(lambda: [], morph_rules=morph_rules)
|
||||
check_morph_rules(ruler)
|
||||
|
||||
|
||||
def test_attributeruler_indices(nlp):
|
||||
|
|
|
@ -4,6 +4,7 @@ tag: class
|
|||
source: spacy/pipeline/attributeruler.py
|
||||
new: 3
|
||||
teaser: 'Pipeline component for rule-based token attribute assignment'
|
||||
api_base_class: /api/pipe
|
||||
api_string_name: attribute_ruler
|
||||
api_trainable: false
|
||||
---
|
||||
|
@ -25,17 +26,13 @@ how the component should be configured. You can override its settings via the
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> config = {
|
||||
> "pattern_dicts": None,
|
||||
> "validate": True,
|
||||
> }
|
||||
> config = {"validate": True}
|
||||
> nlp.add_pipe("attribute_ruler", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `pattern_dicts` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ |
|
||||
| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
|
||||
| Setting | Description |
|
||||
| ---------- | --------------------------------------------------------------------------------------------- |
|
||||
| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/attributeruler.py
|
||||
|
@ -43,36 +40,26 @@ how the component should be configured. You can override its settings via the
|
|||
|
||||
## AttributeRuler.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Initialize the attribute ruler. If pattern dicts are supplied here, they need to
|
||||
be a list of dictionaries with `"patterns"`, `"attrs"`, and optional `"index"`
|
||||
keys, e.g.:
|
||||
|
||||
```python
|
||||
pattern_dicts = [
|
||||
{"patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}},
|
||||
{"patterns": [[{"LOWER": "an"}]], "attrs": {"LEMMA": "a"}},
|
||||
]
|
||||
```
|
||||
Initialize the attribute ruler.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> # Construction via add_pipe
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> ruler = nlp.add_pipe("attribute_ruler")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ |
|
||||
| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `pattern_dicts` | Optional patterns to load in on initialization. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ |
|
||||
| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ |
|
||||
| Name | Description |
|
||||
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ |
|
||||
| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ |
|
||||
|
||||
## AttributeRuler.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Apply the attribute ruler to a `Doc`, setting token attributes for tokens matched
|
||||
by the provided patterns.
|
||||
Apply the attribute ruler to a `Doc`, setting token attributes for tokens
|
||||
matched by the provided patterns.
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
|
@ -90,10 +77,10 @@ may be negative to index from the end of the span.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> ruler = nlp.add_pipe("attribute_ruler")
|
||||
> patterns = [[{"TAG": "VB"}]]
|
||||
> attrs = {"POS": "VERB"}
|
||||
> attribute_ruler.add(patterns=patterns, attrs=attrs)
|
||||
> ruler.add(patterns=patterns, attrs=attrs)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
|
@ -107,11 +94,10 @@ may be negative to index from the end of the span.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> pattern_dicts = [
|
||||
> ruler = nlp.add_pipe("attribute_ruler")
|
||||
> patterns = [
|
||||
> {
|
||||
> "patterns": [[{"TAG": "VB"}]],
|
||||
> "attrs": {"POS": "VERB"}
|
||||
> "patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}
|
||||
> },
|
||||
> {
|
||||
> "patterns": [[{"LOWER": "two"}, {"LOWER": "apples"}]],
|
||||
|
@ -119,15 +105,16 @@ may be negative to index from the end of the span.
|
|||
> "index": -1
|
||||
> },
|
||||
> ]
|
||||
> attribute_ruler.add_patterns(pattern_dicts)
|
||||
> ruler.add_patterns(patterns)
|
||||
> ```
|
||||
|
||||
Add patterns from a list of pattern dicts with the keys as the arguments to
|
||||
Add patterns from a list of pattern dicts. Each pattern dict can specify the
|
||||
keys `"patterns"`, `"attrs"` and `"index"`, which match the arguments of
|
||||
[`AttributeRuler.add`](/api/attributeruler#add).
|
||||
|
||||
| Name | Description |
|
||||
| --------------- | -------------------------------------------------------------------------- |
|
||||
| `pattern_dicts` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ |
|
||||
| Name | Description |
|
||||
| ---------- | -------------------------------------------------------------------------- |
|
||||
| `patterns` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ |
|
||||
|
||||
## AttributeRuler.patterns {#patterns tag="property"}
|
||||
|
||||
|
@ -139,20 +126,39 @@ Get all patterns that have been added to the attribute ruler in the
|
|||
| ----------- | -------------------------------------------------------------------------------------------- |
|
||||
| **RETURNS** | The patterns added to the attribute ruler. ~~List[Dict[str, Union[List[dict], dict, int]]]~~ |
|
||||
|
||||
## AttributeRuler.score {#score tag="method" new="3"}
|
||||
## AttributeRuler.initialize {#initialize tag="method"}
|
||||
|
||||
Score a batch of examples.
|
||||
Initialize the component with data. Typically called before training to load in
|
||||
rules from a file. This method is typically called by
|
||||
[`Language.initialize`](/api/language#initialize) and lets you customize
|
||||
arguments it receives via the
|
||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||
config.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> scores = attribute_ruler.score(examples)
|
||||
> ruler = nlp.add_pipe("attribute_ruler")
|
||||
> ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### config.cfg
|
||||
> [initialize.components.attribute_ruler]
|
||||
>
|
||||
> [initialize.components.attribute_ruler.patterns]
|
||||
> @readers = "srsly.read_json.v1"
|
||||
> path = "corpus/attribute_ruler_patterns.json"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The examples to score. ~~Iterable[Example]~~ |
|
||||
| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ |
|
||||
| Name | Description |
|
||||
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects (the training data). Not used by this component. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| `patterns` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ |
|
||||
| `tag_map` | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. Defaults to `None`. ~~Optional[Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ |
|
||||
| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]]~~ |
|
||||
|
||||
## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"}
|
||||
|
||||
|
@ -170,6 +176,21 @@ Load attribute ruler patterns from morph rules.
|
|||
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ |
|
||||
|
||||
## AttributeRuler.score {#score tag="method" new="3"}
|
||||
|
||||
Score a batch of examples.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> scores = ruler.score(examples)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The examples to score. ~~Iterable[Example]~~ |
|
||||
| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ |
|
||||
|
||||
## AttributeRuler.to_disk {#to_disk tag="method"}
|
||||
|
||||
Serialize the pipe to disk.
|
||||
|
@ -177,8 +198,8 @@ Serialize the pipe to disk.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> attribute_ruler.to_disk("/path/to/attribute_ruler")
|
||||
> ruler = nlp.add_pipe("attribute_ruler")
|
||||
> ruler.to_disk("/path/to/attribute_ruler")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
|
@ -194,8 +215,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> attribute_ruler.from_disk("/path/to/attribute_ruler")
|
||||
> ruler = nlp.add_pipe("attribute_ruler")
|
||||
> ruler.from_disk("/path/to/attribute_ruler")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
|
@ -210,8 +231,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> attribute_ruler_bytes = attribute_ruler.to_bytes()
|
||||
> ruler = nlp.add_pipe("attribute_ruler")
|
||||
> ruler_bytes = ruler.to_bytes()
|
||||
> ```
|
||||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
@ -229,9 +250,9 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> attribute_ruler_bytes = attribute_ruler.to_bytes()
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> attribute_ruler.from_bytes(attribute_ruler_bytes)
|
||||
> ruler_bytes = ruler.to_bytes()
|
||||
> ruler = nlp.add_pipe("attribute_ruler")
|
||||
> ruler.from_bytes(ruler_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
|
@ -250,12 +271,12 @@ serialization by passing in the string names via the `exclude` argument.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> data = attribute_ruler.to_disk("/path", exclude=["vocab"])
|
||||
> data = ruler.to_disk("/path", exclude=["vocab"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------- | -------------------------------------------------------------- |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. |
|
||||
| `attrs` | The attributes to set. You usually don't want to exclude this. |
|
||||
| `indices` | The token indices. You usually don't want to exclude this. |
|
||||
| Name | Description |
|
||||
| ---------- | --------------------------------------------------------------- |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. |
|
||||
| `attrs` | The attributes to set. You usually don't want to exclude this. |
|
||||
| `indices` | The token indices. You usually don't want to exclude this. |
|
||||
|
|
|
@ -1801,17 +1801,7 @@ print(doc2[5].tag_, doc2[5].pos_) # WP PRON
|
|||
|
||||
<Infobox variant="warning" title="Migrating from spaCy v2.x">
|
||||
|
||||
For easy migration from spaCy v2 to v3, the
|
||||
[`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules**
|
||||
in the v2 format with the methods
|
||||
[`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and
|
||||
[`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules).
|
||||
|
||||
```diff
|
||||
nlp = spacy.blank("en")
|
||||
+ ruler = nlp.add_pipe("attribute_ruler")
|
||||
+ ruler.load_from_tag_map(YOUR_TAG_MAP)
|
||||
```
|
||||
The [`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules** in the v2.x format via its built-in methods or when the component is initialized before training. See the [migration guide](/usage/v3#migrating-training-mappings-exceptions) for details.
|
||||
|
||||
</Infobox>
|
||||
|
||||
|
|
|
@ -804,8 +804,30 @@ nlp = spacy.blank("en")
|
|||
Instead of defining a `tag_map` and `morph_rules` in the language data, spaCy
|
||||
v3.0 now manages mappings and exceptions with a separate and more flexible
|
||||
pipeline component, the [`AttributeRuler`](/api/attributeruler). See the
|
||||
[usage guide](/usage/linguistic-features#mappings-exceptions) for examples. The
|
||||
`AttributeRuler` provides two handy helper methods
|
||||
[usage guide](/usage/linguistic-features#mappings-exceptions) for examples. If
|
||||
you have tag maps and morph rules in the v2.x format, you can load them into the
|
||||
attribute ruler before training using the `[initialize]` block of your config.
|
||||
|
||||
> #### What does the initialization do?
|
||||
>
|
||||
> The `[initialize]` block is used when
|
||||
> [`nlp.initialize`](/api/language#initialize) is called (usually right before
|
||||
> training). It lets you define data resources for initializing the pipeline in
|
||||
> your `config.cfg`. After training, the rules are saved to disk with the
|
||||
> exported pipeline, so your runtime model doesn't depend on local data. For
|
||||
> details see the [config lifecycle](/usage/training/#config-lifecycle) and
|
||||
> [initialization](/usage/training/#initialization) docs.
|
||||
|
||||
```ini
|
||||
### config.cfg (excerpt)
|
||||
[initialize.components.attribute_ruler]
|
||||
|
||||
[initialize.components.attribute_ruler.tag_map]
|
||||
@readers = "srsly.read_json.v1"
|
||||
path = "./corpus/tag_map.json"
|
||||
```
|
||||
|
||||
The `AttributeRuler` also provides two handy helper methods
|
||||
[`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and
|
||||
[`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules) that let
|
||||
you load in your existing tag map or morph rules:
|
||||
|
|
Loading…
Reference in New Issue
Block a user