Merge branch 'master' into spacy.io

This commit is contained in:
Ines Montani 2019-08-08 11:21:01 +02:00
commit f623b579d2
8 changed files with 73 additions and 6 deletions

View File

@ -6,7 +6,7 @@ spaCy is a library for advanced Natural Language Processing in Python and
Cython. It's built on the very latest research, and was designed from day one Cython. It's built on the very latest research, and was designed from day one
to be used in real products. spaCy comes with to be used in real products. spaCy comes with
[pre-trained statistical models](https://spacy.io/models) and word vectors, and [pre-trained statistical models](https://spacy.io/models) and word vectors, and
currently supports tokenization for **49+ languages**. It features currently supports tokenization for **50+ languages**. It features
state-of-the-art speed, convolutional **neural network models** for tagging, state-of-the-art speed, convolutional **neural network models** for tagging,
parsing and **named entity recognition** and easy **deep learning** integration. parsing and **named entity recognition** and easy **deep learning** integration.
It's commercial open-source software, released under the MIT license. It's commercial open-source software, released under the MIT license.

View File

@ -4,7 +4,7 @@
# fmt: off # fmt: off
__title__ = "spacy" __title__ = "spacy"
__version__ = "2.1.7" __version__ = "2.1.8"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io" __uri__ = "https://spacy.io"
__author__ = "Explosion AI" __author__ = "Explosion AI"

View File

@ -6,6 +6,7 @@ import requests
import os import os
import subprocess import subprocess
import sys import sys
import pkg_resources
from wasabi import Printer from wasabi import Printer
from .link import link from .link import link
@ -67,6 +68,16 @@ def download(model, direct=False, *pip_args):
"the model via its full package name: " "the model via its full package name: "
"nlp = spacy.load('{}')".format(model, model_name), "nlp = spacy.load('{}')".format(model, model_name),
) )
# If a model is downloaded and then loaded within the same process, our
# is_package check currently fails, because pkg_resources.working_set
# is not refreshed automatically (see #3923). We're trying to work
# around this here by requiring the package explicitly.
try:
pkg_resources.working_set.require(model_name)
except: # noqa: E722
# Maybe it's possible to remove this — mostly worried about cross-
# platform and cross-Python compatibility here
pass
def get_json(url, desc): def get_json(url, desc):

View File

@ -26,7 +26,7 @@ class EntityRuler(object):
name = "entity_ruler" name = "entity_ruler"
def __init__(self, nlp, phrase_matcher_attr=None, **cfg): def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg):
"""Initialize the entitiy ruler. If patterns are supplied here, they """Initialize the entitiy ruler. If patterns are supplied here, they
need to be a list of dictionaries with a `"label"` and `"pattern"` need to be a list of dictionaries with a `"label"` and `"pattern"`
key. A pattern can either be a token pattern (list) or a phrase pattern key. A pattern can either be a token pattern (list) or a phrase pattern
@ -36,6 +36,8 @@ class EntityRuler(object):
and process phrase patterns. and process phrase patterns.
phrase_matcher_attr (int / unicode): Token attribute to match on, passed phrase_matcher_attr (int / unicode): Token attribute to match on, passed
to the internal PhraseMatcher as `attr` to the internal PhraseMatcher as `attr`
validate (bool): Whether patterns should be validated, passed to
Matcher and PhraseMatcher as `validate`
patterns (iterable): Optional patterns to load in. patterns (iterable): Optional patterns to load in.
overwrite_ents (bool): If existing entities are present, e.g. entities overwrite_ents (bool): If existing entities are present, e.g. entities
added by the model, overwrite them by matches if necessary. added by the model, overwrite them by matches if necessary.
@ -50,15 +52,15 @@ class EntityRuler(object):
self.overwrite = cfg.get("overwrite_ents", False) self.overwrite = cfg.get("overwrite_ents", False)
self.token_patterns = defaultdict(list) self.token_patterns = defaultdict(list)
self.phrase_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list)
self.matcher = Matcher(nlp.vocab) self.matcher = Matcher(nlp.vocab, validate=validate)
if phrase_matcher_attr is not None: if phrase_matcher_attr is not None:
self.phrase_matcher_attr = phrase_matcher_attr self.phrase_matcher_attr = phrase_matcher_attr
self.phrase_matcher = PhraseMatcher( self.phrase_matcher = PhraseMatcher(
nlp.vocab, attr=self.phrase_matcher_attr nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
) )
else: else:
self.phrase_matcher_attr = None self.phrase_matcher_attr = None
self.phrase_matcher = PhraseMatcher(nlp.vocab) self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
patterns = cfg.get("patterns") patterns = cfg.get("patterns")
if patterns is not None: if patterns is not None:

View File

@ -5,6 +5,7 @@ import pytest
from spacy.tokens import Span from spacy.tokens import Span
from spacy.language import Language from spacy.language import Language
from spacy.pipeline import EntityRuler from spacy.pipeline import EntityRuler
from spacy.errors import MatchPatternError
@pytest.fixture @pytest.fixture
@ -127,3 +128,21 @@ def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns):
assert len(new_ruler) == len(patterns) assert len(new_ruler) == len(patterns)
assert len(new_ruler.labels) == 4 assert len(new_ruler.labels) == 4
assert new_ruler.phrase_matcher_attr == "LOWER" assert new_ruler.phrase_matcher_attr == "LOWER"
def test_entity_ruler_validate(nlp):
ruler = EntityRuler(nlp)
validated_ruler = EntityRuler(nlp, validate=True)
valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}
# invalid pattern is added without errors without validate
ruler.add_patterns([invalid_pattern])
# valid pattern is added without errors with validate
validated_ruler.add_patterns([valid_pattern])
# invalid pattern raises error with validate
with pytest.raises(MatchPatternError):
validated_ruler.add_patterns([invalid_pattern])

View File

@ -35,6 +35,7 @@ be a token pattern (list) or a phrase pattern (string). For example:
| `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. | | `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. |
| `patterns` | iterable | Optional patterns to load in. | | `patterns` | iterable | Optional patterns to load in. |
| `phrase_matcher_attr` | int / unicode | Optional attr to pass to the internal [`PhraseMatcher`](/api/phrasematcher). Defaults to `None` | | `phrase_matcher_attr` | int / unicode | Optional attr to pass to the internal [`PhraseMatcher`](/api/phrasematcher). Defaults to `None` |
| `validate` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. |
| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. | | `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. |
| `**cfg` | - | Other config parameters. If pipeline component is loaded as part of a model pipeline, this will include all keyword arguments passed to `spacy.load`. | | `**cfg` | - | Other config parameters. If pipeline component is loaded as part of a model pipeline, this will include all keyword arguments passed to `spacy.load`. |
| **RETURNS** | `EntityRuler` | The newly constructed object. | | **RETURNS** | `EntityRuler` | The newly constructed object. |

View File

@ -326,6 +326,29 @@ character, but no whitespace so you'll know it will be handled as one token.
[{"ORTH": "User"}, {"ORTH": "name"}, {"ORTH": ":"}, {}] [{"ORTH": "User"}, {"ORTH": "name"}, {"ORTH": ":"}, {}]
``` ```
#### Validating and debugging patterns {#pattern-validation new="2.1"}
The `Matcher` can validate patterns against a JSON schema with the option
`validate=True`. This is useful for debugging patterns during development, in
particular for catching unsupported attributes.
```python
### {executable="true"}
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab, validate=True)
# Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}]
matcher.add("HelloWorld", None, pattern)
# 🚨 Raises an error:
# MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld'
# Pattern 0:
# - Additional properties are not allowed ('CASEINSENSITIVE' was unexpected) [2]
```
### Adding on_match rules {#on_match} ### Adding on_match rules {#on_match}
To move on to a more realistic example, let's say you're working with a large To move on to a more realistic example, let's say you're working with a large
@ -901,6 +924,16 @@ doc = nlp(u"MyCorp Inc. is a company in the U.S.")
print([(ent.text, ent.label_) for ent in doc.ents]) print([(ent.text, ent.label_) for ent in doc.ents])
``` ```
#### Validating and debugging EntityRuler patterns {#entityruler-pattern-validation new="2.1.8"}
The `EntityRuler` can validate patterns against a JSON schema with the option
`validate=True`. See details under
[Validating and debugging patterns](#pattern-validation).
```python
ruler = EntityRuler(nlp, validate=True)
```
### Using pattern files {#entityruler-files} ### Using pattern files {#entityruler-files}
The [`to_disk`](/api/entityruler#to_disk) and The [`to_disk`](/api/entityruler#to_disk) and

View File

@ -127,6 +127,7 @@
{ "code": "is", "name": "Icelandic" }, { "code": "is", "name": "Icelandic" },
{ "code": "lt", "name": "Lithuanian" }, { "code": "lt", "name": "Lithuanian" },
{ "code": "lv", "name": "Latvian" }, { "code": "lv", "name": "Latvian" },
{ "code": "rs", "name": "Serbian" },
{ "code": "sk", "name": "Slovak" }, { "code": "sk", "name": "Slovak" },
{ "code": "sl", "name": "Slovenian" }, { "code": "sl", "name": "Slovenian" },
{ {