mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
Merge pull request #5891 from adrianeboyd/docs/attribute-ruler-api
Add AttributeRuler API docs
This commit is contained in:
commit
4ca08c6d5d
|
@ -63,7 +63,7 @@ class AttributeRuler(Pipe):
|
||||||
self.add_patterns(pattern_dicts)
|
self.add_patterns(pattern_dicts)
|
||||||
|
|
||||||
def __call__(self, doc: Doc) -> Doc:
|
def __call__(self, doc: Doc) -> Doc:
|
||||||
"""Apply the attributeruler to a Doc and set all attribute exceptions.
|
"""Apply the AttributeRuler to a Doc and set all attribute exceptions.
|
||||||
|
|
||||||
doc (Doc): The document to process.
|
doc (Doc): The document to process.
|
||||||
RETURNS (Doc): The processed Doc.
|
RETURNS (Doc): The processed Doc.
|
||||||
|
@ -107,6 +107,13 @@ class AttributeRuler(Pipe):
|
||||||
def load_from_tag_map(
|
def load_from_tag_map(
|
||||||
self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]]
|
self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]]
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""Load attribute ruler patterns from a tag map.
|
||||||
|
|
||||||
|
tag_map (dict): The tag map that maps fine-grained tags to
|
||||||
|
coarse-grained tags and morphological features.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/attributeruler#load_from_tag_map
|
||||||
|
"""
|
||||||
for tag, attrs in tag_map.items():
|
for tag, attrs in tag_map.items():
|
||||||
pattern = [{"TAG": tag}]
|
pattern = [{"TAG": tag}]
|
||||||
attrs, morph_attrs = _split_morph_attrs(attrs)
|
attrs, morph_attrs = _split_morph_attrs(attrs)
|
||||||
|
@ -117,6 +124,14 @@ class AttributeRuler(Pipe):
|
||||||
def load_from_morph_rules(
|
def load_from_morph_rules(
|
||||||
self, morph_rules: Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
|
self, morph_rules: Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""Load attribute ruler patterns from morph rules.
|
||||||
|
|
||||||
|
morph_rules (dict): The morph rules that map token text and
|
||||||
|
fine-grained tags to coarse-grained tags, lemmas and morphological
|
||||||
|
features.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/attributeruler#load_from_morph_rules
|
||||||
|
"""
|
||||||
for tag in morph_rules:
|
for tag in morph_rules:
|
||||||
for word in morph_rules[tag]:
|
for word in morph_rules[tag]:
|
||||||
pattern = [{"ORTH": word, "TAG": tag}]
|
pattern = [{"ORTH": word, "TAG": tag}]
|
||||||
|
@ -148,11 +163,20 @@ class AttributeRuler(Pipe):
|
||||||
self.indices.append(index)
|
self.indices.append(index)
|
||||||
|
|
||||||
def add_patterns(self, pattern_dicts: Iterable[AttributeRulerPatternType]) -> None:
|
def add_patterns(self, pattern_dicts: Iterable[AttributeRulerPatternType]) -> None:
|
||||||
|
"""Add patterns from a list of pattern dicts with the keys as the
|
||||||
|
arguments to AttributeRuler.add.
|
||||||
|
pattern_dicts (Iterable[dict]): A list of pattern dicts with the keys
|
||||||
|
as the arguments to AttributeRuler.add (patterns/attrs/index) to
|
||||||
|
add as patterns.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/attributeruler#add_patterns
|
||||||
|
"""
|
||||||
for p in pattern_dicts:
|
for p in pattern_dicts:
|
||||||
self.add(**p)
|
self.add(**p)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def patterns(self) -> List[AttributeRulerPatternType]:
|
def patterns(self) -> List[AttributeRulerPatternType]:
|
||||||
|
"""All the added patterns."""
|
||||||
all_patterns = []
|
all_patterns = []
|
||||||
for i in range(len(self.attrs)):
|
for i in range(len(self.attrs)):
|
||||||
p = {}
|
p = {}
|
||||||
|
@ -163,7 +187,7 @@ class AttributeRuler(Pipe):
|
||||||
return all_patterns
|
return all_patterns
|
||||||
|
|
||||||
def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes:
|
def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes:
|
||||||
"""Serialize the attributeruler to a bytestring.
|
"""Serialize the AttributeRuler to a bytestring.
|
||||||
|
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (bytes): The serialized object.
|
RETURNS (bytes): The serialized object.
|
||||||
|
@ -179,7 +203,7 @@ class AttributeRuler(Pipe):
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()):
|
def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()):
|
||||||
"""Load the attributeruler from a bytestring.
|
"""Load the AttributeRuler from a bytestring.
|
||||||
|
|
||||||
bytes_data (bytes): The data to load.
|
bytes_data (bytes): The data to load.
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
@ -215,7 +239,7 @@ class AttributeRuler(Pipe):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()) -> None:
|
def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()) -> None:
|
||||||
"""Serialize the attributeruler to disk.
|
"""Serialize the AttributeRuler to disk.
|
||||||
|
|
||||||
path (Union[Path, str]): A path to a directory.
|
path (Union[Path, str]): A path to a directory.
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
@ -233,7 +257,7 @@ class AttributeRuler(Pipe):
|
||||||
def from_disk(
|
def from_disk(
|
||||||
self, path: Union[Path, str], exclude: Iterable[str] = tuple()
|
self, path: Union[Path, str], exclude: Iterable[str] = tuple()
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Load the attributeruler from disk.
|
"""Load the AttributeRuler from disk.
|
||||||
|
|
||||||
path (Union[Path, str]): A path to a directory.
|
path (Union[Path, str]): A path to a directory.
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
|
245
website/docs/api/attributeruler.md
Normal file
245
website/docs/api/attributeruler.md
Normal file
|
@ -0,0 +1,245 @@
|
||||||
|
---
|
||||||
|
title: AttributeRuler
|
||||||
|
tag: class
|
||||||
|
source: spacy/pipeline/attributeruler.py
|
||||||
|
new: 3
|
||||||
|
teaser: 'Pipeline component for rule-based token attribute assignment'
|
||||||
|
api_string_name: attribute_ruler
|
||||||
|
api_trainable: false
|
||||||
|
---
|
||||||
|
|
||||||
|
The attribute ruler lets you set token attributes for tokens identified by
|
||||||
|
[`Matcher` patterns](/usage/rule-based-matching#matcher). The attribute ruler is
|
||||||
|
typically used to handle exceptions for token attributes and to map values
|
||||||
|
between attributes such as mapping fine-grained POS tags to coarse-grained POS
|
||||||
|
tags.
|
||||||
|
|
||||||
|
## Config and implementation {#config}
|
||||||
|
|
||||||
|
The default config is defined by the pipeline component factory and describes
|
||||||
|
how the component should be configured. You can override its settings via the
|
||||||
|
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||||
|
[`config.cfg` for training](/usage/training#config).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> config = {
|
||||||
|
> "validation": True,
|
||||||
|
> "pattern_dicts": None,
|
||||||
|
> }
|
||||||
|
> nlp.add_pipe("attribute_ruler", config=config)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Setting | Type | Description | Default |
|
||||||
|
| --------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------- |
|
||||||
|
| `pattern_dicts` | `Iterable[dict]` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](#add) (`patterns`/`attrs`/`index`) to add as patterns. | `None` |
|
||||||
|
| `validation` | bool | Whether patterns should be validated, passed to `Matcher` as `validate`. | `False` |
|
||||||
|
|
||||||
|
```python
|
||||||
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## AttributeRuler.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
|
Initialize the attribute ruler. If pattern dicts are supplied here, they need to
|
||||||
|
be a list of dictionaries with `"patterns"`, `"attrs"`, and optional `"index"`
|
||||||
|
keys, e.g.:
|
||||||
|
|
||||||
|
```python
|
||||||
|
pattern_dicts = [
|
||||||
|
{"patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}},
|
||||||
|
{"patterns": [[{"LOWER": "an"}]], "attrs": {"LEMMA": "a"}},
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> # Construction via add_pipe
|
||||||
|
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| --------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `vocab` | `Vocab` | The shared vocabulary to pass to the matcher. |
|
||||||
|
| `name` | str | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `pattern_dicts` | `Iterable[Dict]` | Optional patterns to load in on initialization. |
|
||||||
|
| `validate` | bool | Whether patterns should be validated, passed to `Matcher` as `validate`. Defaults to `False`. |
|
||||||
|
|
||||||
|
## AttributeRuler.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
Apply the attribute ruler to a Doc, setting token attributes for tokens matched
|
||||||
|
by the provided patterns.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ----- | ------------------------------------------------------------ |
|
||||||
|
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
|
||||||
|
| **RETURNS** | `Doc` | The modified `Doc` with the token attributes set. |
|
||||||
|
|
||||||
|
## AttributeRuler.add {#add tag="method"}
|
||||||
|
|
||||||
|
Add patterns to the attribute ruler. The patterns are a list of `Matcher`
|
||||||
|
patterns and the attributes are a dict of attributes to set on the matched
|
||||||
|
token. If the pattern matches a span of more than one token, the `index` can be
|
||||||
|
used to set the attributes for the token at that index in the span. The `index`
|
||||||
|
may be negative to index from the end of the span.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||||
|
> patterns = [[{"TAG": "VB"}]]
|
||||||
|
> attrs = {"POS": "VERB"}
|
||||||
|
> attribute_ruler.add(patterns=patterns, attrs=attrs)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------- | ---------------------- | ----------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| patterns | `Iterable[List[Dict]]` | A list of Matcher patterns. |
|
||||||
|
| attrs | dict | The attributes to assign to the target token in the matched span. |
|
||||||
|
| index | int | The index of the token in the matched span to modify. May be negative to index from the end of the span. Defaults to 0. |
|
||||||
|
|
||||||
|
## AttributeRuler.add_patterns {#add_patterns tag="method"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||||
|
> pattern_dicts = [
|
||||||
|
> {
|
||||||
|
> "patterns": [[{"TAG": "VB"}]],
|
||||||
|
> "attrs": {"POS": "VERB"}
|
||||||
|
> },
|
||||||
|
> {
|
||||||
|
> "patterns": [[{"LOWER": "two"}, {"LOWER": "apples"}]],
|
||||||
|
> "attrs": {"LEMMA": "apple"},
|
||||||
|
> "index": -1
|
||||||
|
> },
|
||||||
|
> ]
|
||||||
|
> attribute_ruler.add_patterns(pattern_dicts)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Add patterns from a list of pattern dicts with the keys as the arguments to
|
||||||
|
[`AttributeRuler.add`](#add).
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| --------------- | ----------------- | -------------------- |
|
||||||
|
| `pattern_dicts` | `Iterable[Dict]` | The patterns to add. |
|
||||||
|
|
||||||
|
## AttributeRuler.patterns {#patterns tag="property"}
|
||||||
|
|
||||||
|
Get all patterns that have been added to the attribute ruler in the
|
||||||
|
`pattern_dicts` format accepted by
|
||||||
|
[`AttributeRuler.add_patterns`](#add_patterns).
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ------------ | ------------------------------------------ |
|
||||||
|
| **RETURNS** | `List[dict]` | The patterns added to the attribute ruler. |
|
||||||
|
|
||||||
|
## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"}
|
||||||
|
|
||||||
|
Load attribute ruler patterns from a tag map.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| --------- | ---- | ------------------------------------------------------------------------------------------ |
|
||||||
|
| `tag_map` | dict | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. |
|
||||||
|
|
||||||
|
## AttributeRuler.load_from_morph_rules {#load_from_morph_rules tag="method"}
|
||||||
|
|
||||||
|
Load attribute ruler patterns from morph rules.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ------------- | ---- | -------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `morph_rules` | dict | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. |
|
||||||
|
|
||||||
|
## AttributeRuler.to_disk {#to_disk tag="method"}
|
||||||
|
|
||||||
|
Serialize the pipe to disk.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||||
|
> attribute_ruler.to_disk("/path/to/attribute_ruler")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
|
## AttributeRuler.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||||
|
> attribute_ruler.from_disk("/path/to/attribute_ruler")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | ---------------- | -------------------------------------------------------------------------- |
|
||||||
|
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
| **RETURNS** | `AttributeRuler` | The modified `AttributeRuler` object. |
|
||||||
|
|
||||||
|
## AttributeRuler.to_bytes {#to_bytes tag="method"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||||
|
> attribute_ruler_bytes = attribute_ruler.to_bytes()
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
| **RETURNS** | bytes | The serialized form of the `AttributeRuler` object. |
|
||||||
|
|
||||||
|
## AttributeRuler.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> attribute_ruler_bytes = attribute_ruler.to_bytes()
|
||||||
|
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||||
|
> attribute_ruler.from_bytes(attribute_ruler_bytes)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | ---------------- | ------------------------------------------------------------------------- |
|
||||||
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
| **RETURNS** | `AttributeRuler` | The `AttributeRuler` object. |
|
||||||
|
|
||||||
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
|
During serialization, spaCy will export several data fields used to restore
|
||||||
|
different aspects of the object. If needed, you can exclude them from
|
||||||
|
serialization by passing in the string names via the `exclude` argument.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> attribute_ruler.to_disk("/path", exclude=["vocab"])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ---------- | -------------------------------------------------------------- |
|
||||||
|
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||||
|
| `patterns` | The Matcher patterns. You usually don't want to exclude this. |
|
||||||
|
| `attrs` | The attributes to set. You usually don't want to exclude this. |
|
||||||
|
| `indices` | The token indices. You usually don't want to exclude this. |
|
|
@ -85,6 +85,7 @@
|
||||||
{ "text": "Lemmatizer", "url": "/api/lemmatizer" },
|
{ "text": "Lemmatizer", "url": "/api/lemmatizer" },
|
||||||
{ "text": "Morphologizer", "url": "/api/morphologizer" },
|
{ "text": "Morphologizer", "url": "/api/morphologizer" },
|
||||||
{ "text": "Tagger", "url": "/api/tagger" },
|
{ "text": "Tagger", "url": "/api/tagger" },
|
||||||
|
{ "text": "AttributeRuler", "url": "/api/attributeruler" },
|
||||||
{ "text": "DependencyParser", "url": "/api/dependencyparser" },
|
{ "text": "DependencyParser", "url": "/api/dependencyparser" },
|
||||||
{ "text": "EntityRecognizer", "url": "/api/entityrecognizer" },
|
{ "text": "EntityRecognizer", "url": "/api/entityrecognizer" },
|
||||||
{ "text": "EntityRuler", "url": "/api/entityruler" },
|
{ "text": "EntityRuler", "url": "/api/entityruler" },
|
||||||
|
|
Loading…
Reference in New Issue
Block a user