mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Merge pull request #6192 from explosion/feature/init-attr-ruler
This commit is contained in:
commit
43d7652635
|
@ -1,10 +1,11 @@
|
||||||
|
from typing import List, Dict, Union, Iterable, Any, Optional, Callable, Iterator
|
||||||
|
from typing import Tuple
|
||||||
import srsly
|
import srsly
|
||||||
from typing import List, Dict, Union, Iterable, Any, Optional
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..training import validate_examples
|
from ..training import validate_examples, Example
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..matcher import Matcher
|
from ..matcher import Matcher
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
|
@ -18,20 +19,13 @@ from .. import util
|
||||||
|
|
||||||
MatcherPatternType = List[Dict[Union[int, str], Any]]
|
MatcherPatternType = List[Dict[Union[int, str], Any]]
|
||||||
AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]
|
AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]
|
||||||
|
TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
|
||||||
|
MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
|
||||||
|
|
||||||
|
|
||||||
@Language.factory(
|
@Language.factory("attribute_ruler", default_config={"validate": False})
|
||||||
"attribute_ruler", default_config={"pattern_dicts": None, "validate": False}
|
def make_attribute_ruler(nlp: Language, name: str, validate: bool):
|
||||||
)
|
return AttributeRuler(nlp.vocab, name, validate=validate)
|
||||||
def make_attribute_ruler(
|
|
||||||
nlp: Language,
|
|
||||||
name: str,
|
|
||||||
pattern_dicts: Optional[Iterable[AttributeRulerPatternType]],
|
|
||||||
validate: bool,
|
|
||||||
):
|
|
||||||
return AttributeRuler(
|
|
||||||
nlp.vocab, name, pattern_dicts=pattern_dicts, validate=validate
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class AttributeRuler(Pipe):
|
class AttributeRuler(Pipe):
|
||||||
|
@ -42,20 +36,15 @@ class AttributeRuler(Pipe):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False
|
||||||
vocab: Vocab,
|
|
||||||
name: str = "attribute_ruler",
|
|
||||||
*,
|
|
||||||
pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None,
|
|
||||||
validate: bool = False,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize the AttributeRuler.
|
"""Create the AttributeRuler. After creation, you can add patterns
|
||||||
|
with the `.initialize()` or `.add_patterns()` methods, or load patterns
|
||||||
|
with `.from_bytes()` or `.from_disk()`. Loading patterns will remove
|
||||||
|
any patterns you've added previously.
|
||||||
|
|
||||||
vocab (Vocab): The vocab.
|
vocab (Vocab): The vocab.
|
||||||
name (str): The pipe name. Defaults to "attribute_ruler".
|
name (str): The pipe name. Defaults to "attribute_ruler".
|
||||||
pattern_dicts (Iterable[Dict]): A list of pattern dicts with the keys as
|
|
||||||
the arguments to AttributeRuler.add (`patterns`/`attrs`/`index`) to add
|
|
||||||
as patterns.
|
|
||||||
|
|
||||||
RETURNS (AttributeRuler): The AttributeRuler component.
|
RETURNS (AttributeRuler): The AttributeRuler component.
|
||||||
|
|
||||||
|
@ -68,8 +57,27 @@ class AttributeRuler(Pipe):
|
||||||
self._attrs_unnormed = [] # store for reference
|
self._attrs_unnormed = [] # store for reference
|
||||||
self.indices = []
|
self.indices = []
|
||||||
|
|
||||||
if pattern_dicts:
|
def initialize(
|
||||||
self.add_patterns(pattern_dicts)
|
self,
|
||||||
|
get_examples: Optional[Callable[[], Iterable[Example]]],
|
||||||
|
*,
|
||||||
|
nlp: Optional[Language] = None,
|
||||||
|
patterns: Optional[Iterable[AttributeRulerPatternType]] = None,
|
||||||
|
tag_map: Optional[TagMapType] = None,
|
||||||
|
morph_rules: Optional[MorphRulesType] = None,
|
||||||
|
):
|
||||||
|
"""Initialize the attribute ruler by adding zero or more patterns.
|
||||||
|
|
||||||
|
Rules can be specified as a sequence of dicts using the `patterns`
|
||||||
|
keyword argument. You can also provide rules using the "tag map" or
|
||||||
|
"morph rules" formats supported by spaCy prior to v3.
|
||||||
|
"""
|
||||||
|
if patterns:
|
||||||
|
self.add_patterns(patterns)
|
||||||
|
if tag_map:
|
||||||
|
self.load_from_tag_map(tag_map)
|
||||||
|
if morph_rules:
|
||||||
|
self.load_from_morph_rules(morph_rules)
|
||||||
|
|
||||||
def __call__(self, doc: Doc) -> Doc:
|
def __call__(self, doc: Doc) -> Doc:
|
||||||
"""Apply the AttributeRuler to a Doc and set all attribute exceptions.
|
"""Apply the AttributeRuler to a Doc and set all attribute exceptions.
|
||||||
|
@ -106,7 +114,7 @@ class AttributeRuler(Pipe):
|
||||||
set_token_attrs(span[index], attrs)
|
set_token_attrs(span[index], attrs)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, *, batch_size=128):
|
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||||
"""Apply the pipe to a stream of documents. This usually happens under
|
"""Apply the pipe to a stream of documents. This usually happens under
|
||||||
the hood when the nlp object is called on a text and all components are
|
the hood when the nlp object is called on a text and all components are
|
||||||
applied to the Doc.
|
applied to the Doc.
|
||||||
|
@ -190,16 +198,16 @@ class AttributeRuler(Pipe):
|
||||||
self.attrs.append(attrs)
|
self.attrs.append(attrs)
|
||||||
self.indices.append(index)
|
self.indices.append(index)
|
||||||
|
|
||||||
def add_patterns(self, pattern_dicts: Iterable[AttributeRulerPatternType]) -> None:
|
def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None:
|
||||||
"""Add patterns from a list of pattern dicts with the keys as the
|
"""Add patterns from a list of pattern dicts with the keys as the
|
||||||
arguments to AttributeRuler.add.
|
arguments to AttributeRuler.add.
|
||||||
pattern_dicts (Iterable[dict]): A list of pattern dicts with the keys
|
patterns (Iterable[dict]): A list of pattern dicts with the keys
|
||||||
as the arguments to AttributeRuler.add (patterns/attrs/index) to
|
as the arguments to AttributeRuler.add (patterns/attrs/index) to
|
||||||
add as patterns.
|
add as patterns.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/attributeruler#add_patterns
|
DOCS: https://nightly.spacy.io/api/attributeruler#add_patterns
|
||||||
"""
|
"""
|
||||||
for p in pattern_dicts:
|
for p in patterns:
|
||||||
self.add(**p)
|
self.add(**p)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -214,7 +222,7 @@ class AttributeRuler(Pipe):
|
||||||
all_patterns.append(p)
|
all_patterns.append(p)
|
||||||
return all_patterns
|
return all_patterns
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||||
"""Score a batch of examples.
|
"""Score a batch of examples.
|
||||||
|
|
||||||
examples (Iterable[Example]): The examples to score.
|
examples (Iterable[Example]): The examples to score.
|
||||||
|
@ -255,7 +263,7 @@ class AttributeRuler(Pipe):
|
||||||
|
|
||||||
def from_bytes(
|
def from_bytes(
|
||||||
self, bytes_data: bytes, exclude: Iterable[str] = SimpleFrozenList()
|
self, bytes_data: bytes, exclude: Iterable[str] = SimpleFrozenList()
|
||||||
):
|
) -> "AttributeRuler":
|
||||||
"""Load the AttributeRuler from a bytestring.
|
"""Load the AttributeRuler from a bytestring.
|
||||||
|
|
||||||
bytes_data (bytes): The data to load.
|
bytes_data (bytes): The data to load.
|
||||||
|
@ -273,7 +281,6 @@ class AttributeRuler(Pipe):
|
||||||
"patterns": load_patterns,
|
"patterns": load_patterns,
|
||||||
}
|
}
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
util.from_bytes(bytes_data, deserialize, exclude)
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(
|
def to_disk(
|
||||||
|
@ -283,6 +290,7 @@ class AttributeRuler(Pipe):
|
||||||
|
|
||||||
path (Union[Path, str]): A path to a directory.
|
path (Union[Path, str]): A path to a directory.
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
|
DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
|
||||||
"""
|
"""
|
||||||
serialize = {
|
serialize = {
|
||||||
|
@ -293,11 +301,13 @@ class AttributeRuler(Pipe):
|
||||||
|
|
||||||
def from_disk(
|
def from_disk(
|
||||||
self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
|
self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
|
||||||
) -> None:
|
) -> "AttributeRuler":
|
||||||
"""Load the AttributeRuler from disk.
|
"""Load the AttributeRuler from disk.
|
||||||
|
|
||||||
path (Union[Path, str]): A path to a directory.
|
path (Union[Path, str]): A path to a directory.
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
RETURNS (AttributeRuler): The loaded object.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/attributeruler#from_disk
|
DOCS: https://nightly.spacy.io/api/attributeruler#from_disk
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -309,11 +319,10 @@ class AttributeRuler(Pipe):
|
||||||
"patterns": load_patterns,
|
"patterns": load_patterns,
|
||||||
}
|
}
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
def _split_morph_attrs(attrs):
|
def _split_morph_attrs(attrs: dict) -> Tuple[dict, dict]:
|
||||||
"""Split entries from a tag map or morph rules dict into to two dicts, one
|
"""Split entries from a tag map or morph rules dict into to two dicts, one
|
||||||
with the token-level features (POS, LEMMA) and one with the remaining
|
with the token-level features (POS, LEMMA) and one with the remaining
|
||||||
features, which are presumed to be individual MORPH features."""
|
features, which are presumed to be individual MORPH features."""
|
||||||
|
|
|
@ -63,6 +63,39 @@ def morph_rules():
|
||||||
return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}}
|
return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}}
|
||||||
|
|
||||||
|
|
||||||
|
def check_tag_map(ruler):
|
||||||
|
doc = Doc(
|
||||||
|
ruler.vocab,
|
||||||
|
words=["This", "is", "a", "test", "."],
|
||||||
|
tags=["DT", "VBZ", "DT", "NN", "."],
|
||||||
|
)
|
||||||
|
doc = ruler(doc)
|
||||||
|
for i in range(len(doc)):
|
||||||
|
if i == 4:
|
||||||
|
assert doc[i].pos_ == "PUNCT"
|
||||||
|
assert str(doc[i].morph) == "PunctType=peri"
|
||||||
|
else:
|
||||||
|
assert doc[i].pos_ == ""
|
||||||
|
assert str(doc[i].morph) == ""
|
||||||
|
|
||||||
|
|
||||||
|
def check_morph_rules(ruler):
|
||||||
|
doc = Doc(
|
||||||
|
ruler.vocab,
|
||||||
|
words=["This", "is", "the", "test", "."],
|
||||||
|
tags=["DT", "VBZ", "DT", "NN", "."],
|
||||||
|
)
|
||||||
|
doc = ruler(doc)
|
||||||
|
for i in range(len(doc)):
|
||||||
|
if i != 2:
|
||||||
|
assert doc[i].pos_ == ""
|
||||||
|
assert str(doc[i].morph) == ""
|
||||||
|
else:
|
||||||
|
assert doc[2].pos_ == "DET"
|
||||||
|
assert doc[2].lemma_ == "a"
|
||||||
|
assert str(doc[2].morph) == "Case=Nom"
|
||||||
|
|
||||||
|
|
||||||
def test_attributeruler_init(nlp, pattern_dicts):
|
def test_attributeruler_init(nlp, pattern_dicts):
|
||||||
a = nlp.add_pipe("attribute_ruler")
|
a = nlp.add_pipe("attribute_ruler")
|
||||||
for p in pattern_dicts:
|
for p in pattern_dicts:
|
||||||
|
@ -78,7 +111,8 @@ def test_attributeruler_init(nlp, pattern_dicts):
|
||||||
|
|
||||||
def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
||||||
# initialize with patterns
|
# initialize with patterns
|
||||||
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
ruler = nlp.add_pipe("attribute_ruler")
|
||||||
|
ruler.initialize(lambda: [], patterns=pattern_dicts)
|
||||||
doc = nlp("This is a test.")
|
doc = nlp("This is a test.")
|
||||||
assert doc[2].lemma_ == "the"
|
assert doc[2].lemma_ == "the"
|
||||||
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||||
|
@ -88,10 +122,11 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
||||||
assert doc.has_annotation("MORPH")
|
assert doc.has_annotation("MORPH")
|
||||||
nlp.remove_pipe("attribute_ruler")
|
nlp.remove_pipe("attribute_ruler")
|
||||||
# initialize with patterns from asset
|
# initialize with patterns from asset
|
||||||
nlp.add_pipe(
|
nlp.config["initialize"]["components"]["attribute_ruler"] = {
|
||||||
"attribute_ruler",
|
"patterns": {"@misc": "attribute_ruler_patterns"}
|
||||||
config={"pattern_dicts": {"@misc": "attribute_ruler_patterns"}},
|
}
|
||||||
)
|
nlp.add_pipe("attribute_ruler")
|
||||||
|
nlp.initialize()
|
||||||
doc = nlp("This is a test.")
|
doc = nlp("This is a test.")
|
||||||
assert doc[2].lemma_ == "the"
|
assert doc[2].lemma_ == "the"
|
||||||
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||||
|
@ -103,18 +138,15 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
||||||
|
|
||||||
def test_attributeruler_score(nlp, pattern_dicts):
|
def test_attributeruler_score(nlp, pattern_dicts):
|
||||||
# initialize with patterns
|
# initialize with patterns
|
||||||
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
ruler = nlp.add_pipe("attribute_ruler")
|
||||||
|
ruler.initialize(lambda: [], patterns=pattern_dicts)
|
||||||
doc = nlp("This is a test.")
|
doc = nlp("This is a test.")
|
||||||
assert doc[2].lemma_ == "the"
|
assert doc[2].lemma_ == "the"
|
||||||
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||||
assert doc[3].lemma_ == "cat"
|
assert doc[3].lemma_ == "cat"
|
||||||
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
||||||
|
doc = nlp.make_doc("This is a test.")
|
||||||
dev_examples = [
|
dev_examples = [Example.from_dict(doc, {"lemmas": ["this", "is", "a", "cat", "."]})]
|
||||||
Example.from_dict(
|
|
||||||
nlp.make_doc("This is a test."), {"lemmas": ["this", "is", "a", "cat", "."]}
|
|
||||||
)
|
|
||||||
]
|
|
||||||
scores = nlp.evaluate(dev_examples)
|
scores = nlp.evaluate(dev_examples)
|
||||||
# "cat" is the only correct lemma
|
# "cat" is the only correct lemma
|
||||||
assert scores["lemma_acc"] == pytest.approx(0.2)
|
assert scores["lemma_acc"] == pytest.approx(0.2)
|
||||||
|
@ -139,40 +171,27 @@ def test_attributeruler_rule_order(nlp):
|
||||||
|
|
||||||
|
|
||||||
def test_attributeruler_tag_map(nlp, tag_map):
|
def test_attributeruler_tag_map(nlp, tag_map):
|
||||||
a = AttributeRuler(nlp.vocab)
|
ruler = AttributeRuler(nlp.vocab)
|
||||||
a.load_from_tag_map(tag_map)
|
ruler.load_from_tag_map(tag_map)
|
||||||
doc = Doc(
|
check_tag_map(ruler)
|
||||||
nlp.vocab,
|
|
||||||
words=["This", "is", "a", "test", "."],
|
|
||||||
tags=["DT", "VBZ", "DT", "NN", "."],
|
def test_attributeruler_tag_map_initialize(nlp, tag_map):
|
||||||
)
|
ruler = nlp.add_pipe("attribute_ruler")
|
||||||
doc = a(doc)
|
ruler.initialize(lambda: [], tag_map=tag_map)
|
||||||
for i in range(len(doc)):
|
check_tag_map(ruler)
|
||||||
if i == 4:
|
|
||||||
assert doc[i].pos_ == "PUNCT"
|
|
||||||
assert str(doc[i].morph) == "PunctType=peri"
|
|
||||||
else:
|
|
||||||
assert doc[i].pos_ == ""
|
|
||||||
assert str(doc[i].morph) == ""
|
|
||||||
|
|
||||||
|
|
||||||
def test_attributeruler_morph_rules(nlp, morph_rules):
|
def test_attributeruler_morph_rules(nlp, morph_rules):
|
||||||
a = AttributeRuler(nlp.vocab)
|
ruler = AttributeRuler(nlp.vocab)
|
||||||
a.load_from_morph_rules(morph_rules)
|
ruler.load_from_morph_rules(morph_rules)
|
||||||
doc = Doc(
|
check_morph_rules(ruler)
|
||||||
nlp.vocab,
|
|
||||||
words=["This", "is", "the", "test", "."],
|
|
||||||
tags=["DT", "VBZ", "DT", "NN", "."],
|
def test_attributeruler_morph_rules_initialize(nlp, morph_rules):
|
||||||
)
|
ruler = nlp.add_pipe("attribute_ruler")
|
||||||
doc = a(doc)
|
ruler.initialize(lambda: [], morph_rules=morph_rules)
|
||||||
for i in range(len(doc)):
|
check_morph_rules(ruler)
|
||||||
if i != 2:
|
|
||||||
assert doc[i].pos_ == ""
|
|
||||||
assert str(doc[i].morph) == ""
|
|
||||||
else:
|
|
||||||
assert doc[2].pos_ == "DET"
|
|
||||||
assert doc[2].lemma_ == "a"
|
|
||||||
assert str(doc[2].morph) == "Case=Nom"
|
|
||||||
|
|
||||||
|
|
||||||
def test_attributeruler_indices(nlp):
|
def test_attributeruler_indices(nlp):
|
||||||
|
|
|
@ -4,6 +4,7 @@ tag: class
|
||||||
source: spacy/pipeline/attributeruler.py
|
source: spacy/pipeline/attributeruler.py
|
||||||
new: 3
|
new: 3
|
||||||
teaser: 'Pipeline component for rule-based token attribute assignment'
|
teaser: 'Pipeline component for rule-based token attribute assignment'
|
||||||
|
api_base_class: /api/pipe
|
||||||
api_string_name: attribute_ruler
|
api_string_name: attribute_ruler
|
||||||
api_trainable: false
|
api_trainable: false
|
||||||
---
|
---
|
||||||
|
@ -25,17 +26,13 @@ how the component should be configured. You can override its settings via the
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> config = {
|
> config = {"validate": True}
|
||||||
> "pattern_dicts": None,
|
|
||||||
> "validate": True,
|
|
||||||
> }
|
|
||||||
> nlp.add_pipe("attribute_ruler", config=config)
|
> nlp.add_pipe("attribute_ruler", config=config)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Setting | Description |
|
| Setting | Description |
|
||||||
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------- | --------------------------------------------------------------------------------------------- |
|
||||||
| `pattern_dicts` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ |
|
| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
|
||||||
| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/pipeline/attributeruler.py
|
%%GITHUB_SPACY/spacy/pipeline/attributeruler.py
|
||||||
|
@ -43,36 +40,26 @@ how the component should be configured. You can override its settings via the
|
||||||
|
|
||||||
## AttributeRuler.\_\_init\_\_ {#init tag="method"}
|
## AttributeRuler.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
Initialize the attribute ruler. If pattern dicts are supplied here, they need to
|
Initialize the attribute ruler.
|
||||||
be a list of dictionaries with `"patterns"`, `"attrs"`, and optional `"index"`
|
|
||||||
keys, e.g.:
|
|
||||||
|
|
||||||
```python
|
|
||||||
pattern_dicts = [
|
|
||||||
{"patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}},
|
|
||||||
{"patterns": [[{"LOWER": "an"}]], "attrs": {"LEMMA": "a"}},
|
|
||||||
]
|
|
||||||
```
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Construction via add_pipe
|
> # Construction via add_pipe
|
||||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
> ruler = nlp.add_pipe("attribute_ruler")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ |
|
| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ |
|
||||||
| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ |
|
| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `pattern_dicts` | Optional patterns to load in on initialization. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ |
|
| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ |
|
||||||
| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ |
|
|
||||||
|
|
||||||
## AttributeRuler.\_\_call\_\_ {#call tag="method"}
|
## AttributeRuler.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
Apply the attribute ruler to a `Doc`, setting token attributes for tokens matched
|
Apply the attribute ruler to a `Doc`, setting token attributes for tokens
|
||||||
by the provided patterns.
|
matched by the provided patterns.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | -------------------------------- |
|
| ----------- | -------------------------------- |
|
||||||
|
@ -90,10 +77,10 @@ may be negative to index from the end of the span.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
> ruler = nlp.add_pipe("attribute_ruler")
|
||||||
> patterns = [[{"TAG": "VB"}]]
|
> patterns = [[{"TAG": "VB"}]]
|
||||||
> attrs = {"POS": "VERB"}
|
> attrs = {"POS": "VERB"}
|
||||||
> attribute_ruler.add(patterns=patterns, attrs=attrs)
|
> ruler.add(patterns=patterns, attrs=attrs)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
@ -107,11 +94,10 @@ may be negative to index from the end of the span.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
> ruler = nlp.add_pipe("attribute_ruler")
|
||||||
> pattern_dicts = [
|
> patterns = [
|
||||||
> {
|
> {
|
||||||
> "patterns": [[{"TAG": "VB"}]],
|
> "patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}
|
||||||
> "attrs": {"POS": "VERB"}
|
|
||||||
> },
|
> },
|
||||||
> {
|
> {
|
||||||
> "patterns": [[{"LOWER": "two"}, {"LOWER": "apples"}]],
|
> "patterns": [[{"LOWER": "two"}, {"LOWER": "apples"}]],
|
||||||
|
@ -119,15 +105,16 @@ may be negative to index from the end of the span.
|
||||||
> "index": -1
|
> "index": -1
|
||||||
> },
|
> },
|
||||||
> ]
|
> ]
|
||||||
> attribute_ruler.add_patterns(pattern_dicts)
|
> ruler.add_patterns(patterns)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Add patterns from a list of pattern dicts with the keys as the arguments to
|
Add patterns from a list of pattern dicts. Each pattern dict can specify the
|
||||||
|
keys `"patterns"`, `"attrs"` and `"index"`, which match the arguments of
|
||||||
[`AttributeRuler.add`](/api/attributeruler#add).
|
[`AttributeRuler.add`](/api/attributeruler#add).
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------- | -------------------------------------------------------------------------- |
|
| ---------- | -------------------------------------------------------------------------- |
|
||||||
| `pattern_dicts` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ |
|
| `patterns` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ |
|
||||||
|
|
||||||
## AttributeRuler.patterns {#patterns tag="property"}
|
## AttributeRuler.patterns {#patterns tag="property"}
|
||||||
|
|
||||||
|
@ -139,20 +126,39 @@ Get all patterns that have been added to the attribute ruler in the
|
||||||
| ----------- | -------------------------------------------------------------------------------------------- |
|
| ----------- | -------------------------------------------------------------------------------------------- |
|
||||||
| **RETURNS** | The patterns added to the attribute ruler. ~~List[Dict[str, Union[List[dict], dict, int]]]~~ |
|
| **RETURNS** | The patterns added to the attribute ruler. ~~List[Dict[str, Union[List[dict], dict, int]]]~~ |
|
||||||
|
|
||||||
## AttributeRuler.score {#score tag="method" new="3"}
|
## AttributeRuler.initialize {#initialize tag="method"}
|
||||||
|
|
||||||
Score a batch of examples.
|
Initialize the component with data. Typically called before training to load in
|
||||||
|
rules from a file. This method is usually called by
|
||||||
|
[`Language.initialize`](/api/language#initialize) and lets you customize
|
||||||
|
arguments it receives via the
|
||||||
|
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||||
|
config.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> scores = attribute_ruler.score(examples)
|
> ruler = nlp.add_pipe("attribute_ruler")
|
||||||
|
> ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> ### config.cfg
|
||||||
|
> [initialize.components.attribute_ruler]
|
||||||
|
>
|
||||||
|
> [initialize.components.attribute_ruler.patterns]
|
||||||
|
> @readers = "srsly.read_json.v1"
|
||||||
|
> path = "corpus/attribute_ruler_patterns.json"
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `examples` | The examples to score. ~~Iterable[Example]~~ |
|
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects (the training data). Not used by this component. ~~Callable[[], Iterable[Example]]~~ |
|
||||||
| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ |
|
| _keyword-only_ | |
|
||||||
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
|
| `patterns` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ |
|
||||||
|
| `tag_map` | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. Defaults to `None`. ~~Optional[Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ |
|
||||||
|
| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]]~~ |
|
||||||
|
|
||||||
## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"}
|
## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"}
|
||||||
|
|
||||||
|
@ -170,6 +176,21 @@ Load attribute ruler patterns from morph rules.
|
||||||
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ |
|
| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ |
|
||||||
|
|
||||||
|
## AttributeRuler.score {#score tag="method" new="3"}
|
||||||
|
|
||||||
|
Score a batch of examples.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> scores = ruler.score(examples)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `examples` | The examples to score. ~~Iterable[Example]~~ |
|
||||||
|
| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ |
|
||||||
|
|
||||||
## AttributeRuler.to_disk {#to_disk tag="method"}
|
## AttributeRuler.to_disk {#to_disk tag="method"}
|
||||||
|
|
||||||
Serialize the pipe to disk.
|
Serialize the pipe to disk.
|
||||||
|
@ -177,8 +198,8 @@ Serialize the pipe to disk.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
> ruler = nlp.add_pipe("attribute_ruler")
|
||||||
> attribute_ruler.to_disk("/path/to/attribute_ruler")
|
> ruler.to_disk("/path/to/attribute_ruler")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
@ -194,8 +215,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
> ruler = nlp.add_pipe("attribute_ruler")
|
||||||
> attribute_ruler.from_disk("/path/to/attribute_ruler")
|
> ruler.from_disk("/path/to/attribute_ruler")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
@ -210,8 +231,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
> ruler = nlp.add_pipe("attribute_ruler")
|
||||||
> attribute_ruler_bytes = attribute_ruler.to_bytes()
|
> ruler_bytes = ruler.to_bytes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Serialize the pipe to a bytestring.
|
Serialize the pipe to a bytestring.
|
||||||
|
@ -229,9 +250,9 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> attribute_ruler_bytes = attribute_ruler.to_bytes()
|
> ruler_bytes = ruler.to_bytes()
|
||||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
> ruler = nlp.add_pipe("attribute_ruler")
|
||||||
> attribute_ruler.from_bytes(attribute_ruler_bytes)
|
> ruler.from_bytes(ruler_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
@ -250,12 +271,12 @@ serialization by passing in the string names via the `exclude` argument.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> data = attribute_ruler.to_disk("/path", exclude=["vocab"])
|
> ruler.to_disk("/path", exclude=["vocab"])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------- | -------------------------------------------------------------- |
|
| ---------- | --------------------------------------------------------------- |
|
||||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||||
| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. |
|
| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. |
|
||||||
| `attrs` | The attributes to set. You usually don't want to exclude this. |
|
| `attrs` | The attributes to set. You usually don't want to exclude this. |
|
||||||
| `indices` | The token indices. You usually don't want to exclude this. |
|
| `indices` | The token indices. You usually don't want to exclude this. |
|
||||||
|
|
|
@ -1801,17 +1801,7 @@ print(doc2[5].tag_, doc2[5].pos_) # WP PRON
|
||||||
|
|
||||||
<Infobox variant="warning" title="Migrating from spaCy v2.x">
|
<Infobox variant="warning" title="Migrating from spaCy v2.x">
|
||||||
|
|
||||||
For easy migration from spaCy v2 to v3, the
|
The [`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules** in the v2.x format via its built-in methods or when the component is initialized before training. See the [migration guide](/usage/v3#migrating-training-mappings-exceptions) for details.
|
||||||
[`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules**
|
|
||||||
in the v2 format with the methods
|
|
||||||
[`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and
|
|
||||||
[`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules).
|
|
||||||
|
|
||||||
```diff
|
|
||||||
nlp = spacy.blank("en")
|
|
||||||
+ ruler = nlp.add_pipe("attribute_ruler")
|
|
||||||
+ ruler.load_from_tag_map(YOUR_TAG_MAP)
|
|
||||||
```
|
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
|
|
@ -804,8 +804,30 @@ nlp = spacy.blank("en")
|
||||||
Instead of defining a `tag_map` and `morph_rules` in the language data, spaCy
|
Instead of defining a `tag_map` and `morph_rules` in the language data, spaCy
|
||||||
v3.0 now manages mappings and exceptions with a separate and more flexible
|
v3.0 now manages mappings and exceptions with a separate and more flexible
|
||||||
pipeline component, the [`AttributeRuler`](/api/attributeruler). See the
|
pipeline component, the [`AttributeRuler`](/api/attributeruler). See the
|
||||||
[usage guide](/usage/linguistic-features#mappings-exceptions) for examples. The
|
[usage guide](/usage/linguistic-features#mappings-exceptions) for examples. If
|
||||||
`AttributeRuler` provides two handy helper methods
|
you have tag maps and morph rules in the v2.x format, you can load them into the
|
||||||
|
attribute ruler before training using the `[initialize]` block of your config.
|
||||||
|
|
||||||
|
> #### What does the initialization do?
|
||||||
|
>
|
||||||
|
> The `[initialize]` block is used when
|
||||||
|
> [`nlp.initialize`](/api/language#initialize) is called (usually right before
|
||||||
|
> training). It lets you define data resources for initializing the pipeline in
|
||||||
|
> your `config.cfg`. After training, the rules are saved to disk with the
|
||||||
|
> exported pipeline, so your runtime model doesn't depend on local data. For
|
||||||
|
> details see the [config lifecycle](/usage/training/#config-lifecycle) and
|
||||||
|
> [initialization](/usage/training/#initialization) docs.
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### config.cfg (excerpt)
|
||||||
|
[initialize.components.attribute_ruler]
|
||||||
|
|
||||||
|
[initialize.components.attribute_ruler.tag_map]
|
||||||
|
@readers = "srsly.read_json.v1"
|
||||||
|
path = "./corpus/tag_map.json"
|
||||||
|
```
|
||||||
|
|
||||||
|
The `AttributeRuler` also provides two handy helper methods
|
||||||
[`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and
|
[`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and
|
||||||
[`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules) that let
|
[`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules) that let
|
||||||
you load in your existing tag map or morph rules:
|
you load in your existing tag map or morph rules:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user