Improve token pattern checking without validation (#4105)

* Fix typo in rule-based matching docs

* Improve token pattern checking without validation

Add more detailed token pattern checks that run without full JSON pattern validation,
and provide more informative error messages. (A brief usage sketch follows the
changed-files summary below.)

Addresses #4070 (also related: #4063, #4100).

* Check whether top-level attributes in patterns and attr for PhraseMatcher are
  in token pattern schema

* Check whether attribute value types are supported in general (as opposed to
  per attribute with full validation)

* Report various internal error types (OverflowError, AttributeError, KeyError)
  as ValueError with standard error messages

* Check for tagger/parser in PhraseMatcher pipeline for attributes TAG, POS,
  LEMMA, and DEP

* Add error messages with relevant details on how to use validate=True or nlp()
  instead of nlp.make_doc()

* Support attr=TEXT for PhraseMatcher

* Add NORM to schema

* Expand tests for pattern validation, Matcher, PhraseMatcher, and EntityRuler

* Remove unnecessary .keys()

* Rephrase error messages

* Add another type check to Matcher

Add another type check to Matcher for more understandable error messages
in some rare cases.

* Support phrase_matcher_attr=TEXT for EntityRuler

* Don't use spacy.errors in examples and bin scripts

* Fix error code

* Auto-format

Also try to get Azure Pipelines to finally start a build :(

* Update errors.py


Co-authored-by: Ines Montani <ines@ines.io>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
adrianeboyd 2019-08-21 14:00:37 +02:00 committed by Ines Montani
parent 3134a9b6e0
commit 8fe7bdd0fa
15 changed files with 162 additions and 58 deletions
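
As an editorial illustration of the behavior described in the commit message, here is a minimal usage sketch. The key names, pattern contents, and printed messages are invented/paraphrased for the example; only the `validate` option and the error classes come from this commit.

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")

# Without validate=True, the Matcher now runs minimal checks and reports
# unsupported attributes and value types as ValueError instead of failing
# with an opaque internal error.
matcher = Matcher(nlp.vocab)
try:
    matcher.add("BAD_ATTR", None, [{"XX": "foo"}])
except ValueError as err:
    print(err)  # E152: the attribute XX is not supported for token patterns

# With validate=True, patterns are additionally checked against the full
# JSON schema, and all problems are reported via MatchPatternError.
strict = Matcher(nlp.vocab, validate=True)
strict.add("GOOD", None, [{"NORM": "usa"}, {"POS": "PROPN", "OP": "?"}])
```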

View File

@@ -8,8 +8,6 @@ from spacy.kb import KnowledgeBase
 import csv
 import datetime
 
-from spacy import Errors
-
 
 def create_kb(
     nlp,
@@ -33,7 +31,10 @@ def create_kb(
         input_dim = nlp.vocab.vectors_length
         print("Loaded pre-trained vectors of size %s" % input_dim)
     else:
-        raise ValueError(Errors.E155)
+        raise ValueError(
+            "The `nlp` object should have access to pre-trained word vectors, "
+            " cf. https://spacy.io/usage/models#languages."
+        )
 
     # disable this part of the pipeline when rerunning the KB generation from preprocessed files
     if read_raw_data:

View File

@@ -73,7 +73,10 @@ def main(
 
     # check the length of the nlp vectors
     if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
-        raise ValueError(Errors.E155)
+        raise ValueError(
+            "The `nlp` object should have access to pre-trained word vectors, "
+            " cf. https://spacy.io/usage/models#languages."
+        )
 
     # STEP 2: create prior probabilities from WP
     print()

View File

@@ -19,8 +19,6 @@ from bin.wiki_entity_linking import training_set_creator
 import spacy
 from spacy.kb import KnowledgeBase
-from spacy import Errors
-
 from spacy.util import minibatch, compounding
@@ -68,7 +66,7 @@ def main(
     # check that there is a NER component in the pipeline
     if "ner" not in nlp.pipe_names:
-        raise ValueError(Errors.E152)
+        raise ValueError("The `nlp` object should have a pre-trained `ner` component.")
 
     # STEP 2 : read the KB
     print()
@@ -82,7 +80,10 @@ def main(
         print(now(), "STEP 3: reading training dataset from", loc_training)
     else:
         if not wp_xml:
-            raise ValueError(Errors.E153)
+            raise ValueError(
+                "Either provide a path to a preprocessed training directory, "
+                "or to the original Wikipedia XML dump."
+            )
 
         if output_dir:
             loc_training = output_dir / "training_data"

View File

@@ -17,12 +17,10 @@ import plac
 from pathlib import Path
 from spacy.vocab import Vocab
 import spacy
 from spacy.kb import KnowledgeBase
 from bin.wiki_entity_linking.train_descriptions import EntityEncoder
-from spacy import Errors
-
 
 # Q2146908 (Russ Cochran): American golfer
@@ -45,7 +43,7 @@ def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
     If an output_dir is provided, the KB will be stored there in a file 'kb'.
     When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
     if model is None and vocab_path is None:
-        raise ValueError(Errors.E154)
+        raise ValueError("Either the `nlp` model or the `vocab` should be specified.")
 
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model

View File

@@ -22,8 +22,6 @@ from spacy.vocab import Vocab
 import spacy
 from spacy.kb import KnowledgeBase
-from spacy import Errors
-
 from spacy.tokens import Span
 from spacy.util import minibatch, compounding

View File

@@ -128,7 +128,7 @@ class DependencyRenderer(object):
         """
         if start < 0 or end < 0:
             error_args = dict(start=start, end=end, label=label, dir=direction)
-            raise ValueError(Errors.E156.format(**error_args))
+            raise ValueError(Errors.E157.format(**error_args))
         level = self.levels.index(end - start) + 1
         x_start = self.offset_x + start * self.distance + self.arrow_spacing
         if self.direction == "rtl":

View File

@@ -431,13 +431,24 @@ class Errors(object):
             "same, but found '{nlp}' and '{vocab}' respectively.")
     E151 = ("Trying to call nlp.update without required annotation types. "
             "Expected top-level keys: {exp}. Got: {unexp}.")
-    E152 = ("The `nlp` object should have a pre-trained `ner` component.")
-    E153 = ("Either provide a path to a preprocessed training directory, "
-            "or to the original Wikipedia XML dump.")
-    E154 = ("Either the `nlp` model or the `vocab` should be specified.")
-    E155 = ("The `nlp` object should have access to pre-trained word vectors, "
-            " cf. https://spacy.io/usage/models#languages.")
-    E156 = ("Can't render negative values for dependency arc start or end. "
+    E152 = ("The attribute {attr} is not supported for token patterns. "
+            "Please use the option validate=True with Matcher, PhraseMatcher, "
+            "or EntityRuler for more details.")
+    E153 = ("The value type {vtype} is not supported for token patterns. "
+            "Please use the option validate=True with Matcher, PhraseMatcher, "
+            "or EntityRuler for more details.")
+    E154 = ("One of the attributes or values is not supported for token "
+            "patterns. Please use the option validate=True with Matcher, "
+            "PhraseMatcher, or EntityRuler for more details.")
+    E155 = ("The pipeline needs to include a tagger in order to use "
+            "PhraseMatcher with the attributes POS, TAG, or LEMMA. Try using "
+            "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
+            "list(nlp.tokenizer.pipe()).")
+    E156 = ("The pipeline needs to include a parser in order to use "
+            "PhraseMatcher with the attribute DEP. Try using "
+            "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
+            "list(nlp.tokenizer.pipe()).")
+    E157 = ("Can't render negative values for dependency arc start or end. "
             "Make sure that you're passing in absolute token indices, not "
             "relative token offsets.\nstart: {start}, end: {end}, label: "
             "{label}, direction: {dir}")

View File

@@ -102,6 +102,10 @@ TOKEN_PATTERN_SCHEMA = {
                 "title": "Entity label of single token",
                 "$ref": "#/definitions/string_value",
             },
+            "NORM": {
+                "title": "Normalized form of the token text",
+                "$ref": "#/definitions/string_value",
+            },
             "LENGTH": {
                 "title": "Token character length",
                 "$ref": "#/definitions/integer_value",

View File

@@ -112,9 +112,12 @@ cdef class Matcher:
                 raise MatchPatternError(key, errors)
         key = self._normalize_key(key)
         for pattern in patterns:
-            specs = _preprocess_pattern(pattern, self.vocab.strings,
-                self._extensions, self._extra_predicates)
-            self.patterns.push_back(init_pattern(self.mem, key, specs))
+            try:
+                specs = _preprocess_pattern(pattern, self.vocab.strings,
+                    self._extensions, self._extra_predicates)
+                self.patterns.push_back(init_pattern(self.mem, key, specs))
+            except OverflowError, AttributeError:
+                raise ValueError(Errors.E154.format())
         self._patterns.setdefault(key, [])
         self._callbacks[key] = on_match
         self._patterns[key].extend(patterns)
@@ -568,6 +571,8 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates):
             # Signifier for 'any token'
             tokens.append((ONE, [(NULL_ATTR, 0)], [], []))
             continue
+        if not isinstance(spec, dict):
+            raise ValueError(Errors.E154.format())
         ops = _get_operators(spec)
         attr_values = _get_attr_values(spec, string_store)
         extensions = _get_extensions(spec, string_store, extensions_table)
@@ -581,21 +586,29 @@ def _get_attr_values(spec, string_store):
     attr_values = []
     for attr, value in spec.items():
         if isinstance(attr, basestring):
+            attr = attr.upper()
             if attr == '_':
                 continue
-            elif attr.upper() == "OP":
+            elif attr == "OP":
                 continue
-            if attr.upper() == "TEXT":
+            if attr == "TEXT":
                 attr = "ORTH"
-            attr = IDS.get(attr.upper())
+            if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]:
+                raise ValueError(Errors.E152.format(attr=attr))
+            attr = IDS.get(attr)
         if isinstance(value, basestring):
             value = string_store.add(value)
         elif isinstance(value, bool):
             value = int(value)
         elif isinstance(value, dict):
             continue
+        else:
+            raise ValueError(Errors.E153.format(vtype=type(value).__name__))
         if attr is not None:
             attr_values.append((attr, value))
+        else:
+            # should be caught above using TOKEN_PATTERN_SCHEMA
+            raise ValueError(Errors.E152.format(attr=attr))
     return attr_values
@@ -755,11 +768,13 @@ def _get_operators(spec):
         return lookup[spec["OP"]]
     else:
         keys = ", ".join(lookup.keys())
-        raise KeyError(Errors.E011.format(op=spec["OP"], opts=keys))
+        raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
 
 
 def _get_extensions(spec, string_store, name2index):
     attr_values = []
+    if not isinstance(spec.get("_", {}), dict):
+        raise ValueError(Errors.E154.format())
     for name, value in spec.get("_", {}).items():
         if isinstance(value, dict):
             # Handle predicates (e.g. "IN", in the extra_predicates, not here.

View File

@@ -12,6 +12,7 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr
 from ..typedefs cimport attr_t, hash_t
 
+from ._schemas import TOKEN_PATTERN_SCHEMA
 from ..errors import Errors, Warnings, deprecation_warning, user_warning
 from ..attrs import FLAG61 as U_ENT
 from ..attrs import FLAG60 as B2_ENT
@@ -62,6 +63,11 @@ cdef class PhraseMatcher:
         if isinstance(attr, long):
             self.attr = attr
         else:
+            attr = attr.upper()
+            if attr == "TEXT":
+                attr = "ORTH"
+            if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]:
+                raise ValueError(Errors.E152.format(attr=attr))
             self.attr = self.vocab.strings[attr]
         self.phrase_ids = PreshMap()
         abstract_patterns = [
@@ -123,6 +129,10 @@ cdef class PhraseMatcher:
             length = doc.length
             if length == 0:
                 continue
+            if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged:
+                raise ValueError(Errors.E155.format())
+            if self.attr == DEP and not doc.is_parsed:
+                raise ValueError(Errors.E156.format())
             if self._validate and (doc.is_tagged or doc.is_parsed) \
                     and self.attr not in (DEP, POS, TAG, LEMMA):
                 string_attr = self.vocab.strings[self.attr]
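
A brief sketch of how the new PhraseMatcher checks surface to users; the terms and variable names below are illustrative, only the attributes and error codes come from this change:

```python
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

nlp = English()  # tokenizer-only pipeline: no tagger, no parser

# attr="TEXT" is now accepted and normalized to ORTH internally
matcher = PhraseMatcher(nlp.vocab, attr="TEXT")
matcher.add("TERMS", None, nlp.make_doc("machine learning"))

# attr="LEMMA" with a Doc from nlp.make_doc() raises E155, because the
# pattern Doc was never tagged; process the phrase with a pipeline that
# has a tagger, i.e. nlp("machine learning"), instead.
lemma_matcher = PhraseMatcher(nlp.vocab, attr="LEMMA")
try:
    lemma_matcher.add("TERMS", None, nlp.make_doc("machine learning"))
except ValueError as err:
    print(err)  # E155

# An unsupported attribute is rejected up front with E152
try:
    PhraseMatcher(nlp.vocab, attr="UNSUPPORTED")
except ValueError as err:
    print(err)  # E152
```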

View File

@@ -54,6 +54,8 @@ class EntityRuler(object):
         self.phrase_patterns = defaultdict(list)
         self.matcher = Matcher(nlp.vocab, validate=validate)
         if phrase_matcher_attr is not None:
+            if phrase_matcher_attr.upper() == "TEXT":
+                phrase_matcher_attr = "ORTH"
             self.phrase_matcher_attr = phrase_matcher_attr
             self.phrase_matcher = PhraseMatcher(
                 nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
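
And a sketch of the phrase_matcher_attr="TEXT" support for the EntityRuler; the label and phrase are hypothetical:

```python
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
# "TEXT" is normalized to "ORTH" before the PhraseMatcher is created,
# so this configuration no longer fails
ruler = EntityRuler(nlp, phrase_matcher_attr="TEXT", validate=True)
ruler.add_patterns([{"label": "ORG", "pattern": "Acme Corp"}])
nlp.add_pipe(ruler)

doc = nlp("She works at Acme Corp")
print([(ent.text, ent.label_) for ent in doc.ents])  # [('Acme Corp', 'ORG')]
```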

View File

@@ -7,6 +7,36 @@ from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA
 from spacy.errors import MatchPatternError
 from spacy.util import get_json_validator, validate_json
 
+# (pattern, num errors with validation, num errors identified with minimal
+# checks)
+TEST_PATTERNS = [
+    # Bad patterns flagged in all cases
+    ([{"XX": "foo"}], 1, 1),
+    ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 1),
+    ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2, 1),
+    ([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
+    ([{"IS_DIGIT": -1}], 1, 1),
+    ([{"ORTH": -1}], 1, 1),
+    ([{"_": "foo"}], 1, 1),
+    ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
+    ([1, 2, 3], 3, 1),
+    # Bad patterns flagged outside of Matcher
+    ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1, 0),
+    # Bad patterns not flagged with minimal checks
+    ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2, 0),
+    ([{"LENGTH": {"VALUE": 5}}], 1, 0),
+    ([{"TEXT": {"VALUE": "foo"}}], 1, 0),
+    # Good patterns
+    ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0),
+    ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0),
+    ([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0, 0),
+    ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0),
+    ([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0),
+    ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0),
+]
+
+XFAIL_TEST_PATTERNS = [([{"orth": "foo"}], 0, 0)]
+
 
 @pytest.fixture
 def validator():
@@ -22,27 +52,24 @@ def test_matcher_pattern_validation(en_vocab, pattern):
         matcher.add("TEST", None, pattern)
 
 
-@pytest.mark.parametrize(
-    "pattern,n_errors",
-    [
-        # Bad patterns
-        ([{"XX": "foo"}], 1),
-        ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2),
-        ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2),
-        ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2),
-        ([{"TEXT": {"VALUE": "foo"}}], 1),
-        ([{"LENGTH": {"VALUE": 5}}], 1),
-        ([{"_": "foo"}], 1),
-        ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1),
-        ([{"IS_PUNCT": True, "OP": "$"}], 1),
-        # Good patterns
-        ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0),
-        ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0),
-        ([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0),
-        ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0),
-        ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0),
-    ],
-)
-def test_pattern_validation(validator, pattern, n_errors):
+@pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS)
+def test_pattern_validation(validator, pattern, n_errors, _):
     errors = validate_json(pattern, validator)
     assert len(errors) == n_errors
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("pattern,n_errors,_", XFAIL_TEST_PATTERNS)
+def test_xfail_pattern_validation(validator, pattern, n_errors, _):
+    errors = validate_json(pattern, validator)
+    assert len(errors) == n_errors
+
+
+@pytest.mark.parametrize("pattern,n_errors,n_min_errors", TEST_PATTERNS)
+def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors):
+    matcher = Matcher(en_vocab)
+    if n_min_errors > 0:
+        with pytest.raises(ValueError):
+            matcher.add("TEST", None, pattern)
+    elif n_errors == 0:
+        matcher.add("TEST", None, pattern)

View File

@@ -99,3 +99,36 @@ def test_phrase_matcher_validation(en_vocab):
     with pytest.warns(None) as record:
         matcher.add("TEST4", None, doc2)
         assert not record.list
+
+
+def test_attr_validation(en_vocab):
+    with pytest.raises(ValueError):
+        PhraseMatcher(en_vocab, attr="UNSUPPORTED")
+
+
+def test_attr_pipeline_checks(en_vocab):
+    doc1 = Doc(en_vocab, words=["Test"])
+    doc1.is_parsed = True
+    doc2 = Doc(en_vocab, words=["Test"])
+    doc2.is_tagged = True
+    doc3 = Doc(en_vocab, words=["Test"])
+    # DEP requires is_parsed
+    matcher = PhraseMatcher(en_vocab, attr="DEP")
+    matcher.add("TEST1", None, doc1)
+    with pytest.raises(ValueError):
+        matcher.add("TEST2", None, doc2)
+    with pytest.raises(ValueError):
+        matcher.add("TEST3", None, doc3)
+    # TAG, POS, LEMMA require is_tagged
+    for attr in ("TAG", "POS", "LEMMA"):
+        matcher = PhraseMatcher(en_vocab, attr=attr)
+        matcher.add("TEST2", None, doc2)
+        with pytest.raises(ValueError):
+            matcher.add("TEST1", None, doc1)
+        with pytest.raises(ValueError):
+            matcher.add("TEST3", None, doc3)
+    # TEXT/ORTH only require tokens
+    matcher = PhraseMatcher(en_vocab, attr="ORTH")
+    matcher.add("TEST3", None, doc3)
+    matcher = PhraseMatcher(en_vocab, attr="TEXT")
+    matcher.add("TEST3", None, doc3)

View File

@@ -137,7 +137,8 @@ def test_entity_ruler_validate(nlp):
     valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
     invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}
 
-    # invalid pattern is added without errors without validate
-    ruler.add_patterns([invalid_pattern])
+    # invalid pattern raises error without validate
+    with pytest.raises(ValueError):
+        ruler.add_patterns([invalid_pattern])
 
     # valid pattern is added without errors with validate

View File

@@ -859,12 +859,12 @@ token pattern covering the exact tokenization of the term.
 <Infobox title="Important note on creating patterns" variant="warning">
 
 To create the patterns, each phrase has to be processed with the `nlp` object.
-If you have a mode loaded, doing this in a loop or list comprehension can easily
-become inefficient and slow. If you **only need the tokenization and lexical
-attributes**, you can run [`nlp.make_doc`](/api/language#make_doc) instead,
-which will only run the tokenizer. For an additional speed boost, you can also
-use the [`nlp.tokenizer.pipe`](/api/tokenizer#pipe) method, which will process
-the texts as a stream.
+If you have a model loaded, doing this in a loop or list comprehension can
+easily become inefficient and slow. If you **only need the tokenization and
+lexical attributes**, you can run [`nlp.make_doc`](/api/language#make_doc)
+instead, which will only run the tokenizer. For an additional speed boost, you
+can also use the [`nlp.tokenizer.pipe`](/api/tokenizer#pipe) method, which will
+process the texts as a stream.
 
 ```diff
 - patterns = [nlp(term) for term in LOTS_OF_TERMS]