From 69aca7d8391e0bbc551fe588e1f3b06f1d68a3f2 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 7 Aug 2019 00:40:53 +0200 Subject: [PATCH 1/7] Add validate option to EntityRuler (#4089) * Add validate option to EntityRuler * Add validate to EntityRuler, passed to Matcher and PhraseMatcher * Add validate to usage and API docs * Update website/docs/usage/rule-based-matching.md Co-Authored-By: Ines Montani * Update website/docs/usage/rule-based-matching.md Co-Authored-By: Ines Montani --- spacy/pipeline/entityruler.py | 10 ++++--- spacy/tests/pipeline/test_entity_ruler.py | 19 ++++++++++++ website/docs/api/entityruler.md | 1 + website/docs/usage/rule-based-matching.md | 35 +++++++++++++++++++++++ 4 files changed, 61 insertions(+), 4 deletions(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 35b465ceb..23c8c91ba 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -26,7 +26,7 @@ class EntityRuler(object): name = "entity_ruler" - def __init__(self, nlp, phrase_matcher_attr=None, **cfg): + def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg): """Initialize the entitiy ruler. If patterns are supplied here, they need to be a list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either be a token pattern (list) or a phrase pattern @@ -36,6 +36,8 @@ class EntityRuler(object): and process phrase patterns. phrase_matcher_attr (int / unicode): Token attribute to match on, passed to the internal PhraseMatcher as `attr` + validate (bool): Whether patterns should be validated, passed to + Matcher and PhraseMatcher as `validate` patterns (iterable): Optional patterns to load in. overwrite_ents (bool): If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. 
@@ -50,15 +52,15 @@ class EntityRuler(object): self.overwrite = cfg.get("overwrite_ents", False) self.token_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list) - self.matcher = Matcher(nlp.vocab) + self.matcher = Matcher(nlp.vocab, validate=validate) if phrase_matcher_attr is not None: self.phrase_matcher_attr = phrase_matcher_attr self.phrase_matcher = PhraseMatcher( - nlp.vocab, attr=self.phrase_matcher_attr + nlp.vocab, attr=self.phrase_matcher_attr, validate=validate ) else: self.phrase_matcher_attr = None - self.phrase_matcher = PhraseMatcher(nlp.vocab) + self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate) self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) patterns = cfg.get("patterns") if patterns is not None: diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 5ab1a3af0..57e980ec3 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -5,6 +5,7 @@ import pytest from spacy.tokens import Span from spacy.language import Language from spacy.pipeline import EntityRuler +from spacy.errors import MatchPatternError @pytest.fixture @@ -127,3 +128,21 @@ def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns): assert len(new_ruler) == len(patterns) assert len(new_ruler.labels) == 4 assert new_ruler.phrase_matcher_attr == "LOWER" + + +def test_entity_ruler_validate(nlp): + ruler = EntityRuler(nlp) + validated_ruler = EntityRuler(nlp, validate=True) + + valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]} + invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]} + + # invalid pattern is added without errors without validate + ruler.add_patterns([invalid_pattern]) + + # valid pattern is added without errors with validate + validated_ruler.add_patterns([valid_pattern]) + + # invalid pattern raises error with validate + with pytest.raises(MatchPatternError): + 
validated_ruler.add_patterns([invalid_pattern]) diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 4424bd254..46dbb3d1d 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -35,6 +35,7 @@ be a token pattern (list) or a phrase pattern (string). For example: | `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. | | `patterns` | iterable | Optional patterns to load in. | | `phrase_matcher_attr` | int / unicode | Optional attr to pass to the internal [`PhraseMatcher`](/api/phtasematcher). defaults to `None` | +| `validate` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. | | `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. | | `**cfg` | - | Other config parameters. If pipeline component is loaded as part of a model pipeline, this will include all keyword arguments passed to `spacy.load`. | | **RETURNS** | `EntityRuler` | The newly constructed object. | diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 16db191d1..80125d933 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -326,6 +326,31 @@ character, but no whitespace – so you'll know it will be handled as one token. [{"ORTH": "User"}, {"ORTH": "name"}, {"ORTH": ":"}, {}] ``` +#### Validating and debugging patterns {#pattern-validation new="2.1"} + +The `Matcher` can validate patterns against a JSON schema with the option +`validate=True`. This is useful for debugging patterns during development, in +particular for catching unsupported attributes. 
+ +```python +### {executable="true"} +import spacy +from spacy.matcher import Matcher + +nlp = spacy.load("en_core_web_sm") +matcher = Matcher(nlp.vocab, validate=True) +# Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE +pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}] +matcher.add("HelloWorld", None, pattern) + +# Raises an error: +# +# spacy.errors.MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld' +# Pattern 0: +# - Additional properties are not allowed ('CASEINSENSITIVE' was unexpected) [2] + +``` + ### Adding on_match rules {#on_match} To move on to a more realistic example, let's say you're working with a large @@ -901,6 +926,16 @@ doc = nlp(u"MyCorp Inc. is a company in the U.S.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` +#### Validating and debugging EntityRuler patterns {#entityruler-pattern-validation} + +The `EntityRuler` can validate patterns against a JSON schema with the option +`validate=True`. See details under [Validating and debugging +patterns](#pattern-validation). 
+ +```python +ruler = EntityRuler(nlp, validate=True) +``` + ### Using pattern files {#entityruler-files} The [`to_disk`](/api/entityruler#to_disk) and From 8b4a0fabbb65fa701261bbb5136d8cd15f65a560 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 7 Aug 2019 00:46:47 +0200 Subject: [PATCH 2/7] Adjust docs example [ci skip] --- website/docs/usage/rule-based-matching.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 80125d933..679b854a1 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -342,10 +342,8 @@ matcher = Matcher(nlp.vocab, validate=True) # Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}] matcher.add("HelloWorld", None, pattern) - -# Raises an error: -# -# spacy.errors.MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld' +# 🚨 Raises an error: +# MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld' # Pattern 0: # - Additional properties are not allowed ('CASEINSENSITIVE' was unexpected) [2] @@ -926,11 +924,11 @@ doc = nlp(u"MyCorp Inc. is a company in the U.S.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` -#### Validating and debugging EntityRuler patterns {#entityruler-pattern-validation} +#### Validating and debugging EntityRuler patterns {#entityruler-pattern-validation new="2.1.8"} The `EntityRuler` can validate patterns against a JSON schema with the option -`validate=True`. See details under [Validating and debugging -patterns](#pattern-validation). +`validate=True`. See details under +[Validating and debugging patterns](#pattern-validation). 
```python ruler = EntityRuler(nlp, validate=True) From 6bec24cdd09c8168d2ce8667e376bb1f0e320c07 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 7 Aug 2019 13:18:11 +0200 Subject: [PATCH 3/7] Require downloaded model in pkg_resources (#4090) --- spacy/cli/download.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 1075b0c60..8a993178a 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -6,6 +6,7 @@ import requests import os import subprocess import sys +import pkg_resources from wasabi import Printer from .link import link @@ -67,6 +68,16 @@ def download(model, direct=False, *pip_args): "the model via its full package name: " "nlp = spacy.load('{}')".format(model, model_name), ) + # If a model is downloaded and then loaded within the same process, our + # is_package check currently fails, because pkg_resources.working_set + # is not refreshed automatically (see #3923). We're trying to work + # around this here by requiring the package explicitly. 
+ try: + pkg_resources.working_set.require(model_name) + except: # noqa: E722 + # Maybe it's possible to remove this – mostly worried about cross- + # platform and cross-Python compatibility here + pass def get_json(url, desc): From 1dc28a9ecb6fb43c23acc8ab45a5d93a3f51315a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 7 Aug 2019 13:38:12 +0200 Subject: [PATCH 4/7] Update Binder version [ci skip] --- website/meta/site.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/site.json b/website/meta/site.json index 1820ff5df..7ec146cf5 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -29,7 +29,7 @@ "spacyVersion": "2.1", "binderUrl": "ines/spacy-io-binder", "binderBranch": "live", - "binderVersion": "2.1.3", + "binderVersion": "2.1.7", "sections": [ { "id": "usage", "title": "Usage Documentation", "theme": "blue" }, { "id": "models", "title": "Models Documentation", "theme": "blue" }, From 3e60afacf99e46ff30299937849251734c52676a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 7 Aug 2019 13:38:25 +0200 Subject: [PATCH 5/7] Add Serbian to languages [ci skip] --- website/meta/languages.json | 1 + 1 file changed, 1 insertion(+) diff --git a/website/meta/languages.json b/website/meta/languages.json index 549bd058b..8f91698ec 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -127,6 +127,7 @@ { "code": "is", "name": "Icelandic" }, { "code": "lt", "name": "Lithuanian" }, { "code": "lv", "name": "Latvian" }, + { "code": "rs", "name": "Serbian" }, { "code": "sk", "name": "Slovak" }, { "code": "sl", "name": "Slovenian" }, { From 36ac04493772144299756360450374bc1d4d70ed Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 7 Aug 2019 13:38:59 +0200 Subject: [PATCH 6/7] Update README.md [ci skip] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f9f484bae..248129535 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ spaCy is a 
library for advanced Natural Language Processing in Python and Cython. It's built on the very latest research, and was designed from day one to be used in real products. spaCy comes with [pre-trained statistical models](https://spacy.io/models) and word vectors, and -currently supports tokenization for **49+ languages**. It features +currently supports tokenization for **50+ languages**. It features state-of-the-art speed, convolutional **neural network models** for tagging, parsing and **named entity recognition** and easy **deep learning** integration. It's commercial open-source software, released under the MIT license. From 04113a844d9042f04c1fa0bc5830f11355b9b526 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 7 Aug 2019 13:53:58 +0200 Subject: [PATCH 7/7] Set version to v2.1.8 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index ae65922c4..9587c9071 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.1.7" +__version__ = "2.1.8" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI"