Merge pull request #5893 from explosion/feature/validate-arg

This commit is contained in:
Ines Montani 2020-08-07 15:47:20 +02:00 committed by GitHub
commit 6f3649923c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 23 additions and 17 deletions

View File

@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0", "cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0", "preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0", "murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a22,<8.0.0a30", "thinc>=8.0.0a23,<8.0.0a30",
"blis>=0.4.0,<0.5.0", "blis>=0.4.0,<0.5.0",
"pytokenizations", "pytokenizations",
"smart_open>=2.0.0,<3.0.0" "smart_open>=2.0.0,<3.0.0"

View File

@ -1,7 +1,7 @@
# Our libraries # Our libraries
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.0.0a22,<8.0.0a30 thinc>=8.0.0a23,<8.0.0a30
blis>=0.4.0,<0.5.0 blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1 ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0

View File

@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a22,<8.0.0a30 thinc>=8.0.0a23,<8.0.0a30
install_requires = install_requires =
# Our libraries # Our libraries
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.0.0a22,<8.0.0a30 thinc>=8.0.0a23,<8.0.0a30
blis>=0.4.0,<0.5.0 blis>=0.4.0,<0.5.0
wasabi>=0.7.1,<1.1.0 wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0 srsly>=2.1.0,<3.0.0

View File

@ -17,13 +17,18 @@ MatcherPatternType = List[Dict[Union[int, str], Any]]
AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]] AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]
@Language.factory("attribute_ruler") @Language.factory(
"attribute_ruler", default_config={"pattern_dicts": None, "validate": False}
)
def make_attribute_ruler( def make_attribute_ruler(
nlp: Language, nlp: Language,
name: str, name: str,
pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None, pattern_dicts: Optional[Iterable[AttributeRulerPatternType]],
validate: bool,
): ):
return AttributeRuler(nlp.vocab, name, pattern_dicts=pattern_dicts) return AttributeRuler(
nlp.vocab, name, pattern_dicts=pattern_dicts, validate=validate
)
class AttributeRuler(Pipe): class AttributeRuler(Pipe):
@ -39,6 +44,7 @@ class AttributeRuler(Pipe):
name: str = "attribute_ruler", name: str = "attribute_ruler",
*, *,
pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None, pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None,
validate: bool = False,
) -> None: ) -> None:
"""Initialize the AttributeRuler. """Initialize the AttributeRuler.
@ -54,7 +60,7 @@ class AttributeRuler(Pipe):
""" """
self.name = name self.name = name
self.vocab = vocab self.vocab = vocab
self.matcher = Matcher(self.vocab) self.matcher = Matcher(self.vocab, validate=validate)
self.attrs = [] self.attrs = []
self._attrs_unnormed = [] # store for reference self._attrs_unnormed = [] # store for reference
self.indices = [] self.indices = []

View File

@ -20,7 +20,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
assigns=["doc.ents", "token.ent_type", "token.ent_iob"], assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
default_config={ default_config={
"phrase_matcher_attr": None, "phrase_matcher_attr": None,
"validation": False, "validate": False,
"overwrite_ents": False, "overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP, "ent_id_sep": DEFAULT_ENT_ID_SEP,
}, },
@ -31,7 +31,7 @@ def make_entity_ruler(
nlp: Language, nlp: Language,
name: str, name: str,
phrase_matcher_attr: Optional[Union[int, str]], phrase_matcher_attr: Optional[Union[int, str]],
validation: bool, validate: bool,
overwrite_ents: bool, overwrite_ents: bool,
ent_id_sep: str, ent_id_sep: str,
): ):
@ -39,7 +39,7 @@ def make_entity_ruler(
nlp, nlp,
name, name,
phrase_matcher_attr=phrase_matcher_attr, phrase_matcher_attr=phrase_matcher_attr,
validate=validation, validate=validate,
overwrite_ents=overwrite_ents, overwrite_ents=overwrite_ents,
ent_id_sep=ent_id_sep, ent_id_sep=ent_id_sep,
) )

View File

@ -25,8 +25,8 @@ how the component should be configured. You can override its settings via the
> >
> ```python > ```python
> config = { > config = {
> "validation": True,
> "pattern_dicts": None, > "pattern_dicts": None,
> "validate": True,
> } > }
> nlp.add_pipe("attribute_ruler", config=config) > nlp.add_pipe("attribute_ruler", config=config)
> ``` > ```
@ -34,7 +34,7 @@ how the component should be configured. You can override its settings via the
| Setting | Type | Description | Default | | Setting | Type | Description | Default |
| --------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------- | | --------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| `pattern_dicts` | `Iterable[dict]` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](#add) (`patterns`/`attrs`/`index`) to add as patterns. | `None` | | `pattern_dicts` | `Iterable[dict]` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](#add) (`patterns`/`attrs`/`index`) to add as patterns. | `None` |
| `validation` | bool | Whether patterns should be validated, passed to `Matcher` as `validate`. | `False` | | `validate` | bool | Whether patterns should be validated (passed to the `Matcher`). | `False` |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py
@ -65,8 +65,8 @@ pattern_dicts = \[
| `vocab` | `Vocab` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. | | `vocab` | `Vocab` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. |
| `name` | str | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. | | `name` | str | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. |
| _keyword-only_ | | | | _keyword-only_ | | |
| `pattern_dicts` | `Iterable[Dict]` | Optional patterns to load in on initialization. | | `pattern_dicts` | `Iterable[Dict]` | Optional patterns to load in on initialization. Defaults to `None`. |
| `validate` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. | | `validate` | bool | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. |
## AttributeRuler.\_\_call\_\_ {#call tag="method"} ## AttributeRuler.\_\_call\_\_ {#call tag="method"}

View File

@ -27,7 +27,7 @@ how the component should be configured. You can override its settings via the
> ```python > ```python
> config = { > config = {
> "phrase_matcher_attr": None, > "phrase_matcher_attr": None,
> "validation": True, > "validate": True,
> "overwrite_ents": False, > "overwrite_ents": False,
> "ent_id_sep": "||", > "ent_id_sep": "||",
> } > }
@ -37,7 +37,7 @@ how the component should be configured. You can override its settings via the
| Setting | Type | Description | Default | | Setting | Type | Description | Default |
| --------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | --------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| `phrase_matcher_attr` | str | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. | `None` | | `phrase_matcher_attr` | str | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. | `None` |
| `validation` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. | `False` | | `validate` | bool | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). | `False` |
| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. | `False` | | `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. | `False` |
| `ent_id_sep` | str | Separator used internally for entity IDs. | `"||"` | | `ent_id_sep` | str | Separator used internally for entity IDs. | `"||"` |