mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Improve token pattern checking without validation (#4105)
* Fix typo in rule-based matching docs * Improve token pattern checking without validation Add more detailed token pattern checks without full JSON pattern validation and provide more detailed error messages. Addresses #4070 (also related: #4063, #4100). * Check whether top-level attributes in patterns and attr for PhraseMatcher are in token pattern schema * Check whether attribute value types are supported in general (as opposed to per attribute with full validation) * Report various internal error types (OverflowError, AttributeError, KeyError) as ValueError with standard error messages * Check for tagger/parser in PhraseMatcher pipeline for attributes TAG, POS, LEMMA, and DEP * Add error messages with relevant details on how to use validate=True or nlp() instead of nlp.make_doc() * Support attr=TEXT for PhraseMatcher * Add NORM to schema * Expand tests for pattern validation, Matcher, PhraseMatcher, and EntityRuler * Remove unnecessary .keys() * Rephrase error messages * Add another type check to Matcher Add another type check to Matcher for more understandable error messages in some rare cases. * Support phrase_matcher_attr=TEXT for EntityRuler * Don't use spacy.errors in examples and bin scripts * Fix error code * Auto-format Also try get Azure pipelines to finally start a build :( * Update errors.py Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent
3134a9b6e0
commit
8fe7bdd0fa
|
@ -8,8 +8,6 @@ from spacy.kb import KnowledgeBase
|
||||||
import csv
|
import csv
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
from spacy import Errors
|
|
||||||
|
|
||||||
|
|
||||||
def create_kb(
|
def create_kb(
|
||||||
nlp,
|
nlp,
|
||||||
|
@ -33,7 +31,10 @@ def create_kb(
|
||||||
input_dim = nlp.vocab.vectors_length
|
input_dim = nlp.vocab.vectors_length
|
||||||
print("Loaded pre-trained vectors of size %s" % input_dim)
|
print("Loaded pre-trained vectors of size %s" % input_dim)
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E155)
|
raise ValueError(
|
||||||
|
"The `nlp` object should have access to pre-trained word vectors, "
|
||||||
|
" cf. https://spacy.io/usage/models#languages."
|
||||||
|
)
|
||||||
|
|
||||||
# disable this part of the pipeline when rerunning the KB generation from preprocessed files
|
# disable this part of the pipeline when rerunning the KB generation from preprocessed files
|
||||||
if read_raw_data:
|
if read_raw_data:
|
||||||
|
|
|
@ -73,7 +73,10 @@ def main(
|
||||||
|
|
||||||
# check the length of the nlp vectors
|
# check the length of the nlp vectors
|
||||||
if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
|
if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
|
||||||
raise ValueError(Errors.E155)
|
raise ValueError(
|
||||||
|
"The `nlp` object should have access to pre-trained word vectors, "
|
||||||
|
" cf. https://spacy.io/usage/models#languages."
|
||||||
|
)
|
||||||
|
|
||||||
# STEP 2: create prior probabilities from WP
|
# STEP 2: create prior probabilities from WP
|
||||||
print()
|
print()
|
||||||
|
|
|
@ -19,8 +19,6 @@ from bin.wiki_entity_linking import training_set_creator
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.kb import KnowledgeBase
|
from spacy.kb import KnowledgeBase
|
||||||
|
|
||||||
from spacy import Errors
|
|
||||||
from spacy.util import minibatch, compounding
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
|
@ -68,7 +66,7 @@ def main(
|
||||||
|
|
||||||
# check that there is a NER component in the pipeline
|
# check that there is a NER component in the pipeline
|
||||||
if "ner" not in nlp.pipe_names:
|
if "ner" not in nlp.pipe_names:
|
||||||
raise ValueError(Errors.E152)
|
raise ValueError("The `nlp` object should have a pre-trained `ner` component.")
|
||||||
|
|
||||||
# STEP 2 : read the KB
|
# STEP 2 : read the KB
|
||||||
print()
|
print()
|
||||||
|
@ -82,7 +80,10 @@ def main(
|
||||||
print(now(), "STEP 3: reading training dataset from", loc_training)
|
print(now(), "STEP 3: reading training dataset from", loc_training)
|
||||||
else:
|
else:
|
||||||
if not wp_xml:
|
if not wp_xml:
|
||||||
raise ValueError(Errors.E153)
|
raise ValueError(
|
||||||
|
"Either provide a path to a preprocessed training directory, "
|
||||||
|
"or to the original Wikipedia XML dump."
|
||||||
|
)
|
||||||
|
|
||||||
if output_dir:
|
if output_dir:
|
||||||
loc_training = output_dir / "training_data"
|
loc_training = output_dir / "training_data"
|
||||||
|
|
|
@ -17,12 +17,10 @@ import plac
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.kb import KnowledgeBase
|
from spacy.kb import KnowledgeBase
|
||||||
|
|
||||||
from bin.wiki_entity_linking.train_descriptions import EntityEncoder
|
from bin.wiki_entity_linking.train_descriptions import EntityEncoder
|
||||||
from spacy import Errors
|
|
||||||
|
|
||||||
|
|
||||||
# Q2146908 (Russ Cochran): American golfer
|
# Q2146908 (Russ Cochran): American golfer
|
||||||
|
@ -45,7 +43,7 @@ def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
|
||||||
If an output_dir is provided, the KB will be stored there in a file 'kb'.
|
If an output_dir is provided, the KB will be stored there in a file 'kb'.
|
||||||
When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
|
When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
|
||||||
if model is None and vocab_path is None:
|
if model is None and vocab_path is None:
|
||||||
raise ValueError(Errors.E154)
|
raise ValueError("Either the `nlp` model or the `vocab` should be specified.")
|
||||||
|
|
||||||
if model is not None:
|
if model is not None:
|
||||||
nlp = spacy.load(model) # load existing spaCy model
|
nlp = spacy.load(model) # load existing spaCy model
|
||||||
|
|
|
@ -22,8 +22,6 @@ from spacy.vocab import Vocab
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.kb import KnowledgeBase
|
from spacy.kb import KnowledgeBase
|
||||||
|
|
||||||
from spacy import Errors
|
|
||||||
from spacy.tokens import Span
|
from spacy.tokens import Span
|
||||||
from spacy.util import minibatch, compounding
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
|
@ -128,7 +128,7 @@ class DependencyRenderer(object):
|
||||||
"""
|
"""
|
||||||
if start < 0 or end < 0:
|
if start < 0 or end < 0:
|
||||||
error_args = dict(start=start, end=end, label=label, dir=direction)
|
error_args = dict(start=start, end=end, label=label, dir=direction)
|
||||||
raise ValueError(Errors.E156.format(**error_args))
|
raise ValueError(Errors.E157.format(**error_args))
|
||||||
level = self.levels.index(end - start) + 1
|
level = self.levels.index(end - start) + 1
|
||||||
x_start = self.offset_x + start * self.distance + self.arrow_spacing
|
x_start = self.offset_x + start * self.distance + self.arrow_spacing
|
||||||
if self.direction == "rtl":
|
if self.direction == "rtl":
|
||||||
|
|
|
@ -431,13 +431,24 @@ class Errors(object):
|
||||||
"same, but found '{nlp}' and '{vocab}' respectively.")
|
"same, but found '{nlp}' and '{vocab}' respectively.")
|
||||||
E151 = ("Trying to call nlp.update without required annotation types. "
|
E151 = ("Trying to call nlp.update without required annotation types. "
|
||||||
"Expected top-level keys: {exp}. Got: {unexp}.")
|
"Expected top-level keys: {exp}. Got: {unexp}.")
|
||||||
E152 = ("The `nlp` object should have a pre-trained `ner` component.")
|
E152 = ("The attribute {attr} is not supported for token patterns. "
|
||||||
E153 = ("Either provide a path to a preprocessed training directory, "
|
"Please use the option validate=True with Matcher, PhraseMatcher, "
|
||||||
"or to the original Wikipedia XML dump.")
|
"or EntityRuler for more details.")
|
||||||
E154 = ("Either the `nlp` model or the `vocab` should be specified.")
|
E153 = ("The value type {vtype} is not supported for token patterns. "
|
||||||
E155 = ("The `nlp` object should have access to pre-trained word vectors, "
|
"Please use the option validate=True with Matcher, PhraseMatcher, "
|
||||||
" cf. https://spacy.io/usage/models#languages.")
|
"or EntityRuler for more details.")
|
||||||
E156 = ("Can't render negative values for dependency arc start or end. "
|
E154 = ("One of the attributes or values is not supported for token "
|
||||||
|
"patterns. Please use the option validate=True with Matcher, "
|
||||||
|
"PhraseMatcher, or EntityRuler for more details.")
|
||||||
|
E155 = ("The pipeline needs to include a tagger in order to use "
|
||||||
|
"PhraseMatcher with the attributes POS, TAG, or LEMMA. Try using "
|
||||||
|
"nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
|
||||||
|
"list(nlp.tokenizer.pipe()).")
|
||||||
|
E156 = ("The pipeline needs to include a parser in order to use "
|
||||||
|
"PhraseMatcher with the attribute DEP. Try using "
|
||||||
|
"nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
|
||||||
|
"list(nlp.tokenizer.pipe()).")
|
||||||
|
E157 = ("Can't render negative values for dependency arc start or end. "
|
||||||
"Make sure that you're passing in absolute token indices, not "
|
"Make sure that you're passing in absolute token indices, not "
|
||||||
"relative token offsets.\nstart: {start}, end: {end}, label: "
|
"relative token offsets.\nstart: {start}, end: {end}, label: "
|
||||||
"{label}, direction: {dir}")
|
"{label}, direction: {dir}")
|
||||||
|
|
|
@ -102,6 +102,10 @@ TOKEN_PATTERN_SCHEMA = {
|
||||||
"title": "Entity label of single token",
|
"title": "Entity label of single token",
|
||||||
"$ref": "#/definitions/string_value",
|
"$ref": "#/definitions/string_value",
|
||||||
},
|
},
|
||||||
|
"NORM": {
|
||||||
|
"title": "Normalized form of the token text",
|
||||||
|
"$ref": "#/definitions/string_value",
|
||||||
|
},
|
||||||
"LENGTH": {
|
"LENGTH": {
|
||||||
"title": "Token character length",
|
"title": "Token character length",
|
||||||
"$ref": "#/definitions/integer_value",
|
"$ref": "#/definitions/integer_value",
|
||||||
|
|
|
@ -112,9 +112,12 @@ cdef class Matcher:
|
||||||
raise MatchPatternError(key, errors)
|
raise MatchPatternError(key, errors)
|
||||||
key = self._normalize_key(key)
|
key = self._normalize_key(key)
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
|
try:
|
||||||
specs = _preprocess_pattern(pattern, self.vocab.strings,
|
specs = _preprocess_pattern(pattern, self.vocab.strings,
|
||||||
self._extensions, self._extra_predicates)
|
self._extensions, self._extra_predicates)
|
||||||
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
||||||
|
except OverflowError, AttributeError:
|
||||||
|
raise ValueError(Errors.E154.format())
|
||||||
self._patterns.setdefault(key, [])
|
self._patterns.setdefault(key, [])
|
||||||
self._callbacks[key] = on_match
|
self._callbacks[key] = on_match
|
||||||
self._patterns[key].extend(patterns)
|
self._patterns[key].extend(patterns)
|
||||||
|
@ -568,6 +571,8 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
|
||||||
# Signifier for 'any token'
|
# Signifier for 'any token'
|
||||||
tokens.append((ONE, [(NULL_ATTR, 0)], [], []))
|
tokens.append((ONE, [(NULL_ATTR, 0)], [], []))
|
||||||
continue
|
continue
|
||||||
|
if not isinstance(spec, dict):
|
||||||
|
raise ValueError(Errors.E154.format())
|
||||||
ops = _get_operators(spec)
|
ops = _get_operators(spec)
|
||||||
attr_values = _get_attr_values(spec, string_store)
|
attr_values = _get_attr_values(spec, string_store)
|
||||||
extensions = _get_extensions(spec, string_store, extensions_table)
|
extensions = _get_extensions(spec, string_store, extensions_table)
|
||||||
|
@ -581,21 +586,29 @@ def _get_attr_values(spec, string_store):
|
||||||
attr_values = []
|
attr_values = []
|
||||||
for attr, value in spec.items():
|
for attr, value in spec.items():
|
||||||
if isinstance(attr, basestring):
|
if isinstance(attr, basestring):
|
||||||
|
attr = attr.upper()
|
||||||
if attr == '_':
|
if attr == '_':
|
||||||
continue
|
continue
|
||||||
elif attr.upper() == "OP":
|
elif attr == "OP":
|
||||||
continue
|
continue
|
||||||
if attr.upper() == "TEXT":
|
if attr == "TEXT":
|
||||||
attr = "ORTH"
|
attr = "ORTH"
|
||||||
attr = IDS.get(attr.upper())
|
if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]:
|
||||||
|
raise ValueError(Errors.E152.format(attr=attr))
|
||||||
|
attr = IDS.get(attr)
|
||||||
if isinstance(value, basestring):
|
if isinstance(value, basestring):
|
||||||
value = string_store.add(value)
|
value = string_store.add(value)
|
||||||
elif isinstance(value, bool):
|
elif isinstance(value, bool):
|
||||||
value = int(value)
|
value = int(value)
|
||||||
elif isinstance(value, dict):
|
elif isinstance(value, dict):
|
||||||
continue
|
continue
|
||||||
|
else:
|
||||||
|
raise ValueError(Errors.E153.format(vtype=type(value).__name__))
|
||||||
if attr is not None:
|
if attr is not None:
|
||||||
attr_values.append((attr, value))
|
attr_values.append((attr, value))
|
||||||
|
else:
|
||||||
|
# should be caught above using TOKEN_PATTERN_SCHEMA
|
||||||
|
raise ValueError(Errors.E152.format(attr=attr))
|
||||||
return attr_values
|
return attr_values
|
||||||
|
|
||||||
|
|
||||||
|
@ -755,11 +768,13 @@ def _get_operators(spec):
|
||||||
return lookup[spec["OP"]]
|
return lookup[spec["OP"]]
|
||||||
else:
|
else:
|
||||||
keys = ", ".join(lookup.keys())
|
keys = ", ".join(lookup.keys())
|
||||||
raise KeyError(Errors.E011.format(op=spec["OP"], opts=keys))
|
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
|
||||||
|
|
||||||
|
|
||||||
def _get_extensions(spec, string_store, name2index):
|
def _get_extensions(spec, string_store, name2index):
|
||||||
attr_values = []
|
attr_values = []
|
||||||
|
if not isinstance(spec.get("_", {}), dict):
|
||||||
|
raise ValueError(Errors.E154.format())
|
||||||
for name, value in spec.get("_", {}).items():
|
for name, value in spec.get("_", {}).items():
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
# Handle predicates (e.g. "IN", in the extra_predicates, not here.
|
# Handle predicates (e.g. "IN", in the extra_predicates, not here.
|
||||||
|
|
|
@ -12,6 +12,7 @@ from ..vocab cimport Vocab
|
||||||
from ..tokens.doc cimport Doc, get_token_attr
|
from ..tokens.doc cimport Doc, get_token_attr
|
||||||
from ..typedefs cimport attr_t, hash_t
|
from ..typedefs cimport attr_t, hash_t
|
||||||
|
|
||||||
|
from ._schemas import TOKEN_PATTERN_SCHEMA
|
||||||
from ..errors import Errors, Warnings, deprecation_warning, user_warning
|
from ..errors import Errors, Warnings, deprecation_warning, user_warning
|
||||||
from ..attrs import FLAG61 as U_ENT
|
from ..attrs import FLAG61 as U_ENT
|
||||||
from ..attrs import FLAG60 as B2_ENT
|
from ..attrs import FLAG60 as B2_ENT
|
||||||
|
@ -62,6 +63,11 @@ cdef class PhraseMatcher:
|
||||||
if isinstance(attr, long):
|
if isinstance(attr, long):
|
||||||
self.attr = attr
|
self.attr = attr
|
||||||
else:
|
else:
|
||||||
|
attr = attr.upper()
|
||||||
|
if attr == "TEXT":
|
||||||
|
attr = "ORTH"
|
||||||
|
if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]:
|
||||||
|
raise ValueError(Errors.E152.format(attr=attr))
|
||||||
self.attr = self.vocab.strings[attr]
|
self.attr = self.vocab.strings[attr]
|
||||||
self.phrase_ids = PreshMap()
|
self.phrase_ids = PreshMap()
|
||||||
abstract_patterns = [
|
abstract_patterns = [
|
||||||
|
@ -123,6 +129,10 @@ cdef class PhraseMatcher:
|
||||||
length = doc.length
|
length = doc.length
|
||||||
if length == 0:
|
if length == 0:
|
||||||
continue
|
continue
|
||||||
|
if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged:
|
||||||
|
raise ValueError(Errors.E155.format())
|
||||||
|
if self.attr == DEP and not doc.is_parsed:
|
||||||
|
raise ValueError(Errors.E156.format())
|
||||||
if self._validate and (doc.is_tagged or doc.is_parsed) \
|
if self._validate and (doc.is_tagged or doc.is_parsed) \
|
||||||
and self.attr not in (DEP, POS, TAG, LEMMA):
|
and self.attr not in (DEP, POS, TAG, LEMMA):
|
||||||
string_attr = self.vocab.strings[self.attr]
|
string_attr = self.vocab.strings[self.attr]
|
||||||
|
|
|
@ -54,6 +54,8 @@ class EntityRuler(object):
|
||||||
self.phrase_patterns = defaultdict(list)
|
self.phrase_patterns = defaultdict(list)
|
||||||
self.matcher = Matcher(nlp.vocab, validate=validate)
|
self.matcher = Matcher(nlp.vocab, validate=validate)
|
||||||
if phrase_matcher_attr is not None:
|
if phrase_matcher_attr is not None:
|
||||||
|
if phrase_matcher_attr.upper() == "TEXT":
|
||||||
|
phrase_matcher_attr = "ORTH"
|
||||||
self.phrase_matcher_attr = phrase_matcher_attr
|
self.phrase_matcher_attr = phrase_matcher_attr
|
||||||
self.phrase_matcher = PhraseMatcher(
|
self.phrase_matcher = PhraseMatcher(
|
||||||
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
|
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
|
||||||
|
|
|
@ -7,6 +7,36 @@ from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA
|
||||||
from spacy.errors import MatchPatternError
|
from spacy.errors import MatchPatternError
|
||||||
from spacy.util import get_json_validator, validate_json
|
from spacy.util import get_json_validator, validate_json
|
||||||
|
|
||||||
|
# (pattern, num errors with validation, num errors identified with minimal
|
||||||
|
# checks)
|
||||||
|
TEST_PATTERNS = [
|
||||||
|
# Bad patterns flagged in all cases
|
||||||
|
([{"XX": "foo"}], 1, 1),
|
||||||
|
([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 1),
|
||||||
|
([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2, 1),
|
||||||
|
([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
|
||||||
|
([{"IS_DIGIT": -1}], 1, 1),
|
||||||
|
([{"ORTH": -1}], 1, 1),
|
||||||
|
([{"_": "foo"}], 1, 1),
|
||||||
|
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
|
||||||
|
([1, 2, 3], 3, 1),
|
||||||
|
# Bad patterns flagged outside of Matcher
|
||||||
|
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1, 0),
|
||||||
|
# Bad patterns not flagged with minimal checks
|
||||||
|
([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2, 0),
|
||||||
|
([{"LENGTH": {"VALUE": 5}}], 1, 0),
|
||||||
|
([{"TEXT": {"VALUE": "foo"}}], 1, 0),
|
||||||
|
# Good patterns
|
||||||
|
([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0),
|
||||||
|
([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0),
|
||||||
|
([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0, 0),
|
||||||
|
([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0),
|
||||||
|
([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0),
|
||||||
|
([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0),
|
||||||
|
]
|
||||||
|
|
||||||
|
XFAIL_TEST_PATTERNS = [([{"orth": "foo"}], 0, 0)]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def validator():
|
def validator():
|
||||||
|
@ -22,27 +52,24 @@ def test_matcher_pattern_validation(en_vocab, pattern):
|
||||||
matcher.add("TEST", None, pattern)
|
matcher.add("TEST", None, pattern)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS)
|
||||||
"pattern,n_errors",
|
def test_pattern_validation(validator, pattern, n_errors, _):
|
||||||
[
|
|
||||||
# Bad patterns
|
|
||||||
([{"XX": "foo"}], 1),
|
|
||||||
([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2),
|
|
||||||
([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2),
|
|
||||||
([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2),
|
|
||||||
([{"TEXT": {"VALUE": "foo"}}], 1),
|
|
||||||
([{"LENGTH": {"VALUE": 5}}], 1),
|
|
||||||
([{"_": "foo"}], 1),
|
|
||||||
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1),
|
|
||||||
([{"IS_PUNCT": True, "OP": "$"}], 1),
|
|
||||||
# Good patterns
|
|
||||||
([{"TEXT": "foo"}, {"LOWER": "bar"}], 0),
|
|
||||||
([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0),
|
|
||||||
([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0),
|
|
||||||
([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0),
|
|
||||||
([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_pattern_validation(validator, pattern, n_errors):
|
|
||||||
errors = validate_json(pattern, validator)
|
errors = validate_json(pattern, validator)
|
||||||
assert len(errors) == n_errors
|
assert len(errors) == n_errors
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
|
@pytest.mark.parametrize("pattern,n_errors,_", XFAIL_TEST_PATTERNS)
|
||||||
|
def test_xfail_pattern_validation(validator, pattern, n_errors, _):
|
||||||
|
errors = validate_json(pattern, validator)
|
||||||
|
assert len(errors) == n_errors
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("pattern,n_errors,n_min_errors", TEST_PATTERNS)
|
||||||
|
def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors):
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
if n_min_errors > 0:
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
matcher.add("TEST", None, pattern)
|
||||||
|
elif n_errors == 0:
|
||||||
|
matcher.add("TEST", None, pattern)
|
||||||
|
|
|
@ -99,3 +99,36 @@ def test_phrase_matcher_validation(en_vocab):
|
||||||
with pytest.warns(None) as record:
|
with pytest.warns(None) as record:
|
||||||
matcher.add("TEST4", None, doc2)
|
matcher.add("TEST4", None, doc2)
|
||||||
assert not record.list
|
assert not record.list
|
||||||
|
|
||||||
|
|
||||||
|
def test_attr_validation(en_vocab):
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
PhraseMatcher(en_vocab, attr="UNSUPPORTED")
|
||||||
|
|
||||||
|
|
||||||
|
def test_attr_pipeline_checks(en_vocab):
|
||||||
|
doc1 = Doc(en_vocab, words=["Test"])
|
||||||
|
doc1.is_parsed = True
|
||||||
|
doc2 = Doc(en_vocab, words=["Test"])
|
||||||
|
doc2.is_tagged = True
|
||||||
|
doc3 = Doc(en_vocab, words=["Test"])
|
||||||
|
# DEP requires is_parsed
|
||||||
|
matcher = PhraseMatcher(en_vocab, attr="DEP")
|
||||||
|
matcher.add("TEST1", None, doc1)
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
matcher.add("TEST2", None, doc2)
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
matcher.add("TEST3", None, doc3)
|
||||||
|
# TAG, POS, LEMMA require is_tagged
|
||||||
|
for attr in ("TAG", "POS", "LEMMA"):
|
||||||
|
matcher = PhraseMatcher(en_vocab, attr=attr)
|
||||||
|
matcher.add("TEST2", None, doc2)
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
matcher.add("TEST1", None, doc1)
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
matcher.add("TEST3", None, doc3)
|
||||||
|
# TEXT/ORTH only require tokens
|
||||||
|
matcher = PhraseMatcher(en_vocab, attr="ORTH")
|
||||||
|
matcher.add("TEST3", None, doc3)
|
||||||
|
matcher = PhraseMatcher(en_vocab, attr="TEXT")
|
||||||
|
matcher.add("TEST3", None, doc3)
|
||||||
|
|
|
@ -137,7 +137,8 @@ def test_entity_ruler_validate(nlp):
|
||||||
valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
|
valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
|
||||||
invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}
|
invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}
|
||||||
|
|
||||||
# invalid pattern is added without errors without validate
|
# invalid pattern raises error without validate
|
||||||
|
with pytest.raises(ValueError):
|
||||||
ruler.add_patterns([invalid_pattern])
|
ruler.add_patterns([invalid_pattern])
|
||||||
|
|
||||||
# valid pattern is added without errors with validate
|
# valid pattern is added without errors with validate
|
||||||
|
|
|
@ -859,12 +859,12 @@ token pattern covering the exact tokenization of the term.
|
||||||
<Infobox title="Important note on creating patterns" variant="warning">
|
<Infobox title="Important note on creating patterns" variant="warning">
|
||||||
|
|
||||||
To create the patterns, each phrase has to be processed with the `nlp` object.
|
To create the patterns, each phrase has to be processed with the `nlp` object.
|
||||||
If you have a mode loaded, doing this in a loop or list comprehension can easily
|
If you have a model loaded, doing this in a loop or list comprehension can
|
||||||
become inefficient and slow. If you **only need the tokenization and lexical
|
easily become inefficient and slow. If you **only need the tokenization and
|
||||||
attributes**, you can run [`nlp.make_doc`](/api/language#make_doc) instead,
|
lexical attributes**, you can run [`nlp.make_doc`](/api/language#make_doc)
|
||||||
which will only run the tokenizer. For an additional speed boost, you can also
|
instead, which will only run the tokenizer. For an additional speed boost, you
|
||||||
use the [`nlp.tokenizer.pipe`](/api/tokenizer#pipe) method, which will process
|
can also use the [`nlp.tokenizer.pipe`](/api/tokenizer#pipe) method, which will
|
||||||
the texts as a stream.
|
process the texts as a stream.
|
||||||
|
|
||||||
```diff
|
```diff
|
||||||
- patterns = [nlp(term) for term in LOTS_OF_TERMS]
|
- patterns = [nlp(term) for term in LOTS_OF_TERMS]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user