Merge branch 'master' into spacy.io

Ines Montani 2019-08-21 21:36:10 +02:00
commit 073e8d647c
23 changed files with 271 additions and 65 deletions

View File

@@ -8,8 +8,6 @@ from spacy.kb import KnowledgeBase
 import csv
 import datetime
-from spacy import Errors


 def create_kb(
     nlp,
@@ -33,7 +31,10 @@ def create_kb(
         input_dim = nlp.vocab.vectors_length
         print("Loaded pre-trained vectors of size %s" % input_dim)
     else:
-        raise ValueError(Errors.E155)
+        raise ValueError(
+            "The `nlp` object should have access to pre-trained word vectors, "
+            " cf. https://spacy.io/usage/models#languages."
+        )
     # disable this part of the pipeline when rerunning the KB generation from preprocessed files
     if read_raw_data:

View File

@@ -73,7 +73,10 @@ def main(
     # check the length of the nlp vectors
     if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
-        raise ValueError(Errors.E155)
+        raise ValueError(
+            "The `nlp` object should have access to pre-trained word vectors, "
+            " cf. https://spacy.io/usage/models#languages."
+        )

     # STEP 2: create prior probabilities from WP
     print()

View File

@@ -19,8 +19,6 @@ from bin.wiki_entity_linking import training_set_creator
 import spacy
 from spacy.kb import KnowledgeBase
-from spacy import Errors
 from spacy.util import minibatch, compounding
@@ -68,7 +66,7 @@ def main(
     # check that there is a NER component in the pipeline
     if "ner" not in nlp.pipe_names:
-        raise ValueError(Errors.E152)
+        raise ValueError("The `nlp` object should have a pre-trained `ner` component.")

     # STEP 2 : read the KB
     print()
@@ -82,7 +80,10 @@ def main(
         print(now(), "STEP 3: reading training dataset from", loc_training)
     else:
         if not wp_xml:
-            raise ValueError(Errors.E153)
+            raise ValueError(
+                "Either provide a path to a preprocessed training directory, "
+                "or to the original Wikipedia XML dump."
+            )

         if output_dir:
             loc_training = output_dir / "training_data"

View File

@@ -17,12 +17,10 @@ import plac
 from pathlib import Path
 from spacy.vocab import Vocab
 import spacy
 from spacy.kb import KnowledgeBase
 from bin.wiki_entity_linking.train_descriptions import EntityEncoder
-from spacy import Errors


 # Q2146908 (Russ Cochran): American golfer
@@ -45,7 +43,7 @@ def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
     If an output_dir is provided, the KB will be stored there in a file 'kb'.
     When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
     if model is None and vocab_path is None:
-        raise ValueError(Errors.E154)
+        raise ValueError("Either the `nlp` model or the `vocab` should be specified.")
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model

View File

@@ -22,8 +22,6 @@ from spacy.vocab import Vocab
 import spacy
 from spacy.kb import KnowledgeBase
-from spacy import Errors
 from spacy.tokens import Span
 from spacy.util import minibatch, compounding

View File

@@ -128,7 +128,7 @@ class DependencyRenderer(object):
         """
         if start < 0 or end < 0:
             error_args = dict(start=start, end=end, label=label, dir=direction)
-            raise ValueError(Errors.E156.format(**error_args))
+            raise ValueError(Errors.E157.format(**error_args))
         level = self.levels.index(end - start) + 1
         x_start = self.offset_x + start * self.distance + self.arrow_spacing
         if self.direction == "rtl":

View File

@@ -431,13 +431,24 @@ class Errors(object):
             "same, but found '{nlp}' and '{vocab}' respectively.")
     E151 = ("Trying to call nlp.update without required annotation types. "
             "Expected top-level keys: {exp}. Got: {unexp}.")
-    E152 = ("The `nlp` object should have a pre-trained `ner` component.")
-    E153 = ("Either provide a path to a preprocessed training directory, "
-            "or to the original Wikipedia XML dump.")
-    E154 = ("Either the `nlp` model or the `vocab` should be specified.")
-    E155 = ("The `nlp` object should have access to pre-trained word vectors, "
-            " cf. https://spacy.io/usage/models#languages.")
-    E156 = ("Can't render negative values for dependency arc start or end. "
+    E152 = ("The attribute {attr} is not supported for token patterns. "
+            "Please use the option validate=True with Matcher, PhraseMatcher, "
+            "or EntityRuler for more details.")
+    E153 = ("The value type {vtype} is not supported for token patterns. "
+            "Please use the option validate=True with Matcher, PhraseMatcher, "
+            "or EntityRuler for more details.")
+    E154 = ("One of the attributes or values is not supported for token "
+            "patterns. Please use the option validate=True with Matcher, "
+            "PhraseMatcher, or EntityRuler for more details.")
+    E155 = ("The pipeline needs to include a tagger in order to use "
+            "Matcher or PhraseMatcher with the attributes POS, TAG, or LEMMA. "
+            "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
+            "instead of list(nlp.tokenizer.pipe()).")
+    E156 = ("The pipeline needs to include a parser in order to use "
+            "Matcher or PhraseMatcher with the attribute DEP. Try using "
+            "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
+            "list(nlp.tokenizer.pipe()).")
+    E157 = ("Can't render negative values for dependency arc start or end. "
             "Make sure that you're passing in absolute token indices, not "
             "relative token offsets.\nstart: {start}, end: {end}, label: "
             "{label}, direction: {dir}")

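To illustrate how the repurposed error codes surface, here is a minimal sketch (not part of the commit; the vocabulary and patterns are made up for illustration):

```python
from spacy.matcher import Matcher
from spacy.vocab import Vocab

vocab = Vocab()

# Without validate=True, only the new minimal checks run, so an unsupported
# attribute is reported via E152:
try:
    Matcher(vocab).add("TEST", None, [{"FOO": "bar"}])
except ValueError as err:
    print(err)  # "The attribute FOO is not supported for token patterns. ..."

# With validate=True, the full JSON schema is checked and all problems in a
# pattern are collected into a MatchPatternError:
try:
    Matcher(vocab, validate=True).add("TEST", None, [{"LENGTH": "2", "TEXT": 2}])
except Exception as err:
    print(type(err).__name__)  # MatchPatternError, listing both schema errors
```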
View File

@@ -12,7 +12,7 @@ from ...util import update_exc, add_lookups
 class SerbianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "rs"
+    lex_attr_getters[LANG] = lambda text: "sr"
     lex_attr_getters[NORM] = add_lookups(
         Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
     )
@@ -21,7 +21,7 @@ class SerbianDefaults(Language.Defaults):
 class Serbian(Language):
-    lang = "rs"
+    lang = "sr"
     Defaults = SerbianDefaults

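A quick way to see the effect of the corrected language code (a minimal sketch, assuming a spaCy version that includes this change):

```python
import spacy

# Serbian is now registered under its ISO 639-1 code "sr" instead of "rs"
nlp = spacy.blank("sr")
print(nlp.lang)  # sr
doc = nlp("Ово је реченица.")
print([token.text for token in doc])
```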
View File

@@ -102,6 +102,10 @@ TOKEN_PATTERN_SCHEMA = {
             "title": "Entity label of single token",
             "$ref": "#/definitions/string_value",
         },
+        "NORM": {
+            "title": "Normalized form of the token text",
+            "$ref": "#/definitions/string_value",
+        },
         "LENGTH": {
             "title": "Token character length",
             "$ref": "#/definitions/integer_value",

View File

@@ -67,3 +67,4 @@ cdef class Matcher:
     cdef public object _callbacks
     cdef public object _extensions
     cdef public object _extra_predicates
+    cdef public object _seen_attrs

View File

@@ -15,7 +15,7 @@ from ..structs cimport TokenC
 from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr
 from ..tokens.token cimport Token
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH
+from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA

 from ._schemas import TOKEN_PATTERN_SCHEMA
 from ..util import get_json_validator, validate_json
@@ -45,7 +45,7 @@ cdef class Matcher:
         self._patterns = {}
         self._callbacks = {}
         self._extensions = {}
-        self._extra_predicates = []
+        self._seen_attrs = set()
         self.vocab = vocab
         self.mem = Pool()
         if validate:
@@ -112,9 +112,15 @@ cdef class Matcher:
             raise MatchPatternError(key, errors)
         key = self._normalize_key(key)
         for pattern in patterns:
-            specs = _preprocess_pattern(pattern, self.vocab.strings,
-                                        self._extensions, self._extra_predicates)
-            self.patterns.push_back(init_pattern(self.mem, key, specs))
+            try:
+                specs = _preprocess_pattern(pattern, self.vocab.strings,
+                                            self._extensions, self._extra_predicates)
+                self.patterns.push_back(init_pattern(self.mem, key, specs))
+                for spec in specs:
+                    for attr, _ in spec[1]:
+                        self._seen_attrs.add(attr)
+            except (OverflowError, AttributeError):
+                raise ValueError(Errors.E154.format())
         self._patterns.setdefault(key, [])
         self._callbacks[key] = on_match
         self._patterns[key].extend(patterns)
@@ -177,6 +183,11 @@ cdef class Matcher:
         describing the matches. A match tuple describes a span
         `doc[start:end]`. The `label_id` and `key` are both integers.
         """
+        if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
+                and not doc.is_tagged:
+            raise ValueError(Errors.E155.format())
+        if DEP in self._seen_attrs and not doc.is_parsed:
+            raise ValueError(Errors.E156.format())
         matches = find_matches(&self.patterns[0], self.patterns.size(), doc,
                                extensions=self._extensions,
                                predicates=self._extra_predicates)
@@ -568,6 +579,8 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates):
             # Signifier for 'any token'
             tokens.append((ONE, [(NULL_ATTR, 0)], [], []))
             continue
+        if not isinstance(spec, dict):
+            raise ValueError(Errors.E154.format())
         ops = _get_operators(spec)
         attr_values = _get_attr_values(spec, string_store)
         extensions = _get_extensions(spec, string_store, extensions_table)
@@ -581,21 +594,29 @@ def _get_attr_values(spec, string_store):
     attr_values = []
     for attr, value in spec.items():
         if isinstance(attr, basestring):
+            attr = attr.upper()
             if attr == '_':
                 continue
-            elif attr.upper() == "OP":
+            elif attr == "OP":
                 continue
-            if attr.upper() == "TEXT":
+            if attr == "TEXT":
                 attr = "ORTH"
-            attr = IDS.get(attr.upper())
+            if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]:
+                raise ValueError(Errors.E152.format(attr=attr))
+            attr = IDS.get(attr)
         if isinstance(value, basestring):
            value = string_store.add(value)
        elif isinstance(value, bool):
            value = int(value)
        elif isinstance(value, dict):
            continue
+        else:
+            raise ValueError(Errors.E153.format(vtype=type(value).__name__))
         if attr is not None:
             attr_values.append((attr, value))
+        else:
+            # should be caught above using TOKEN_PATTERN_SCHEMA
+            raise ValueError(Errors.E152.format(attr=attr))
     return attr_values
@@ -755,11 +776,13 @@ def _get_operators(spec):
         return lookup[spec["OP"]]
     else:
         keys = ", ".join(lookup.keys())
-        raise KeyError(Errors.E011.format(op=spec["OP"], opts=keys))
+        raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))


 def _get_extensions(spec, string_store, name2index):
     attr_values = []
+    if not isinstance(spec.get("_", {}), dict):
+        raise ValueError(Errors.E154.format())
     for name, value in spec.get("_", {}).items():
         if isinstance(value, dict):
             # Handle predicates (e.g. "IN", in the extra_predicates, not here.

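The effect of the new `_seen_attrs` bookkeeping, sketched against a loaded pipeline (assumes the `en_core_web_sm` model is installed; not part of the commit):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")  # assumed to be installed
matcher = Matcher(nlp.vocab)
matcher.add("VERBS", None, [{"POS": "VERB"}])

# nlp.make_doc() only tokenizes, so the Doc has no POS tags and E155 is raised
doc = nlp.make_doc("She works here")
try:
    matcher(doc)
except ValueError as err:
    print(err)  # "The pipeline needs to include a tagger ..."

# Running the full pipeline sets doc.is_tagged, so matching works
print(matcher(nlp("She works here")))
```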
View File

@@ -12,6 +12,7 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr
 from ..typedefs cimport attr_t, hash_t

+from ._schemas import TOKEN_PATTERN_SCHEMA
 from ..errors import Errors, Warnings, deprecation_warning, user_warning
 from ..attrs import FLAG61 as U_ENT
 from ..attrs import FLAG60 as B2_ENT
@@ -62,6 +63,11 @@ cdef class PhraseMatcher:
         if isinstance(attr, long):
             self.attr = attr
         else:
+            attr = attr.upper()
+            if attr == "TEXT":
+                attr = "ORTH"
+            if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]:
+                raise ValueError(Errors.E152.format(attr=attr))
             self.attr = self.vocab.strings[attr]
         self.phrase_ids = PreshMap()
         abstract_patterns = [
@@ -123,6 +129,10 @@ cdef class PhraseMatcher:
             length = doc.length
             if length == 0:
                 continue
+            if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged:
+                raise ValueError(Errors.E155.format())
+            if self.attr == DEP and not doc.is_parsed:
+                raise ValueError(Errors.E156.format())
             if self._validate and (doc.is_tagged or doc.is_parsed) \
                     and self.attr not in (DEP, POS, TAG, LEMMA):
                 string_attr = self.vocab.strings[self.attr]

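A corresponding sketch for the PhraseMatcher (again assuming `en_core_web_sm` is installed; not part of the commit):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")  # assumed to be installed

PhraseMatcher(nlp.vocab, attr="text")  # lowercase works; "TEXT" maps to "ORTH"
try:
    PhraseMatcher(nlp.vocab, attr="UNSUPPORTED")
except ValueError as err:
    print(err)  # E152: the attribute is not supported

matcher = PhraseMatcher(nlp.vocab, attr="LEMMA")
try:
    matcher.add("TEST", None, nlp.make_doc("heads"))  # pattern Doc is not tagged
except ValueError as err:
    print(err)  # E155: the pipeline needs to include a tagger
matcher.add("TEST", None, nlp("heads"))  # fully processed Doc carries lemmas
```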
View File

@@ -54,6 +54,8 @@ class EntityRuler(object):
         self.phrase_patterns = defaultdict(list)
         self.matcher = Matcher(nlp.vocab, validate=validate)
         if phrase_matcher_attr is not None:
+            if phrase_matcher_attr.upper() == "TEXT":
+                phrase_matcher_attr = "ORTH"
             self.phrase_matcher_attr = phrase_matcher_attr
             self.phrase_matcher = PhraseMatcher(
                 nlp.vocab, attr=self.phrase_matcher_attr, validate=validate

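For the EntityRuler, this means "TEXT" is accepted as a phrase matcher attribute and treated like "ORTH". A rough usage sketch (the label and pattern are made up for illustration):

```python
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.blank("en")
ruler = EntityRuler(nlp, phrase_matcher_attr="TEXT")  # mapped to "ORTH" internally
ruler.add_patterns([{"label": "ORG", "pattern": "Acme Corp"}])
nlp.add_pipe(ruler)
doc = nlp("I work at Acme Corp")
print([(ent.text, ent.label_) for ent in doc.ents])  # [('Acme Corp', 'ORG')]
```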
View File

@@ -10,8 +10,8 @@ from spacy.util import get_lang_class
 # excluded: ja, ru, th, uk, vi, zh
 LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
              "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
-             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "rs", "si",
-             "sk", "sl", "sq", "sv", "ta", "te", "tl", "tr", "tt", "ur"]
+             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
+             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur"]
 # fmt: on

View File

@@ -344,3 +344,39 @@ def test_dependency_matcher_compile(dependency_matcher):
     # assert matches[0][1] == [[3, 1, 2]]
     # assert matches[1][1] == [[4, 3, 3]]
     # assert matches[2][1] == [[4, 3, 2]]
+
+
+def test_attr_pipeline_checks(en_vocab):
+    doc1 = Doc(en_vocab, words=["Test"])
+    doc1.is_parsed = True
+    doc2 = Doc(en_vocab, words=["Test"])
+    doc2.is_tagged = True
+    doc3 = Doc(en_vocab, words=["Test"])
+    # DEP requires is_parsed
+    matcher = Matcher(en_vocab)
+    matcher.add("TEST", None, [{"DEP": "a"}])
+    matcher(doc1)
+    with pytest.raises(ValueError):
+        matcher(doc2)
+    with pytest.raises(ValueError):
+        matcher(doc3)
+    # TAG, POS, LEMMA require is_tagged
+    for attr in ("TAG", "POS", "LEMMA"):
+        matcher = Matcher(en_vocab)
+        matcher.add("TEST", None, [{attr: "a"}])
+        matcher(doc2)
+        with pytest.raises(ValueError):
+            matcher(doc1)
+        with pytest.raises(ValueError):
+            matcher(doc3)
+    # TEXT/ORTH only require tokens
+    matcher = Matcher(en_vocab)
+    matcher.add("TEST", None, [{"ORTH": "a"}])
+    matcher(doc1)
+    matcher(doc2)
+    matcher(doc3)
+    matcher = Matcher(en_vocab)
+    matcher.add("TEST", None, [{"TEXT": "a"}])
+    matcher(doc1)
+    matcher(doc2)
+    matcher(doc3)

View File

@@ -7,6 +7,36 @@ from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA
 from spacy.errors import MatchPatternError
 from spacy.util import get_json_validator, validate_json

+# (pattern, num errors with validation, num errors identified with minimal
+# checks)
+TEST_PATTERNS = [
+    # Bad patterns flagged in all cases
+    ([{"XX": "foo"}], 1, 1),
+    ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 1),
+    ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2, 1),
+    ([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
+    ([{"IS_DIGIT": -1}], 1, 1),
+    ([{"ORTH": -1}], 1, 1),
+    ([{"_": "foo"}], 1, 1),
+    ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
+    ([1, 2, 3], 3, 1),
+    # Bad patterns flagged outside of Matcher
+    ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1, 0),
+    # Bad patterns not flagged with minimal checks
+    ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2, 0),
+    ([{"LENGTH": {"VALUE": 5}}], 1, 0),
+    ([{"TEXT": {"VALUE": "foo"}}], 1, 0),
+    # Good patterns
+    ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0),
+    ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0),
+    ([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0, 0),
+    ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0),
+    ([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0),
+    ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0),
+]
+
+XFAIL_TEST_PATTERNS = [([{"orth": "foo"}], 0, 0)]
+

 @pytest.fixture
 def validator():
@@ -22,27 +52,24 @@ def test_matcher_pattern_validation(en_vocab, pattern):
         matcher.add("TEST", None, pattern)


-@pytest.mark.parametrize(
-    "pattern,n_errors",
-    [
-        # Bad patterns
-        ([{"XX": "foo"}], 1),
-        ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2),
-        ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2),
-        ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2),
-        ([{"TEXT": {"VALUE": "foo"}}], 1),
-        ([{"LENGTH": {"VALUE": 5}}], 1),
-        ([{"_": "foo"}], 1),
-        ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1),
-        ([{"IS_PUNCT": True, "OP": "$"}], 1),
-        # Good patterns
-        ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0),
-        ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0),
-        ([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0),
-        ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0),
-        ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0),
-    ],
-)
-def test_pattern_validation(validator, pattern, n_errors):
+@pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS)
+def test_pattern_validation(validator, pattern, n_errors, _):
     errors = validate_json(pattern, validator)
     assert len(errors) == n_errors
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("pattern,n_errors,_", XFAIL_TEST_PATTERNS)
+def test_xfail_pattern_validation(validator, pattern, n_errors, _):
+    errors = validate_json(pattern, validator)
+    assert len(errors) == n_errors
+
+
+@pytest.mark.parametrize("pattern,n_errors,n_min_errors", TEST_PATTERNS)
+def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors):
+    matcher = Matcher(en_vocab)
+    if n_min_errors > 0:
+        with pytest.raises(ValueError):
+            matcher.add("TEST", None, pattern)
+    elif n_errors == 0:
+        matcher.add("TEST", None, pattern)

View File

@@ -99,3 +99,36 @@ def test_phrase_matcher_validation(en_vocab):
     with pytest.warns(None) as record:
         matcher.add("TEST4", None, doc2)
         assert not record.list
+
+
+def test_attr_validation(en_vocab):
+    with pytest.raises(ValueError):
+        PhraseMatcher(en_vocab, attr="UNSUPPORTED")
+
+
+def test_attr_pipeline_checks(en_vocab):
+    doc1 = Doc(en_vocab, words=["Test"])
+    doc1.is_parsed = True
+    doc2 = Doc(en_vocab, words=["Test"])
+    doc2.is_tagged = True
+    doc3 = Doc(en_vocab, words=["Test"])
+    # DEP requires is_parsed
+    matcher = PhraseMatcher(en_vocab, attr="DEP")
+    matcher.add("TEST1", None, doc1)
+    with pytest.raises(ValueError):
+        matcher.add("TEST2", None, doc2)
+    with pytest.raises(ValueError):
+        matcher.add("TEST3", None, doc3)
+    # TAG, POS, LEMMA require is_tagged
+    for attr in ("TAG", "POS", "LEMMA"):
+        matcher = PhraseMatcher(en_vocab, attr=attr)
+        matcher.add("TEST2", None, doc2)
+        with pytest.raises(ValueError):
+            matcher.add("TEST1", None, doc1)
+        with pytest.raises(ValueError):
+            matcher.add("TEST3", None, doc3)
+    # TEXT/ORTH only require tokens
+    matcher = PhraseMatcher(en_vocab, attr="ORTH")
+    matcher.add("TEST3", None, doc3)
+    matcher = PhraseMatcher(en_vocab, attr="TEXT")
+    matcher.add("TEST3", None, doc3)

View File

@@ -137,7 +137,8 @@ def test_entity_ruler_validate(nlp):
     valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
     invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}

-    # invalid pattern is added without errors without validate
-    ruler.add_patterns([invalid_pattern])
+    # invalid pattern raises error without validate
+    with pytest.raises(ValueError):
+        ruler.add_patterns([invalid_pattern])

     # valid pattern is added without errors with validate

View File

@@ -859,12 +859,12 @@ token pattern covering the exact tokenization of the term.
 <Infobox title="Important note on creating patterns" variant="warning">

 To create the patterns, each phrase has to be processed with the `nlp` object.
-If you have a mode loaded, doing this in a loop or list comprehension can easily
-become inefficient and slow. If you **only need the tokenization and lexical
-attributes**, you can run [`nlp.make_doc`](/api/language#make_doc) instead,
-which will only run the tokenizer. For an additional speed boost, you can also
-use the [`nlp.tokenizer.pipe`](/api/tokenizer#pipe) method, which will process
-the texts as a stream.
+If you have a model loaded, doing this in a loop or list comprehension can
+easily become inefficient and slow. If you **only need the tokenization and
+lexical attributes**, you can run [`nlp.make_doc`](/api/language#make_doc)
+instead, which will only run the tokenizer. For an additional speed boost, you
+can also use the [`nlp.tokenizer.pipe`](/api/tokenizer#pipe) method, which will
+process the texts as a stream.

 ```diff
 - patterns = [nlp(term) for term in LOTS_OF_TERMS]

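For context, the pattern-creation advice from that page looks roughly like this in practice (a sketch; the model name and term list are placeholders):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")  # placeholder model name
terms = ["machine learning", "deep learning", "natural language processing"]

# Only the tokenizer is needed to build the pattern Docs:
patterns = [nlp.make_doc(term) for term in terms]
# Or, as a stream, for an additional speed boost:
patterns = list(nlp.tokenizer.pipe(terms))

matcher = PhraseMatcher(nlp.vocab)
matcher.add("TERMS", None, *patterns)
doc = nlp("She studies natural language processing.")
print([doc[start:end].text for _, start, end in matcher(doc)])
```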
View File

@@ -127,7 +127,7 @@
         { "code": "is", "name": "Icelandic" },
         { "code": "lt", "name": "Lithuanian" },
         { "code": "lv", "name": "Latvian" },
-        { "code": "rs", "name": "Serbian" },
+        { "code": "sr", "name": "Serbian" },
         { "code": "sk", "name": "Slovak" },
         { "code": "sl", "name": "Slovenian" },
         {

View File

@@ -1089,6 +1089,62 @@
             "youtube": "6zm9NC9uRkk",
             "category": ["videos"]
         },
+        {
+            "type": "education",
+            "id": "video-intro-to-nlp-episode-1",
+            "title": "Intro to NLP with spaCy",
+            "slogan": "Episode 1: Data exploration",
+            "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.",
+            "author": "Vincent Warmerdam",
+            "author_links": {
+                "twitter": "fishnets88",
+                "github": "koaning"
+            },
+            "youtube": "WnGPv6HnBok",
+            "category": ["videos"]
+        },
+        {
+            "type": "education",
+            "id": "video-spacy-irl-entity-linking",
+            "title": "Entity Linking functionality in spaCy",
+            "slogan": "spaCy IRL 2019",
+            "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc",
+            "author": "Sofie Van Landeghem",
+            "author_links": {
+                "twitter": "OxyKodit",
+                "github": "svlandeg"
+            },
+            "youtube": "PW3RJM8tDGo",
+            "category": ["videos"]
+        },
+        {
+            "type": "education",
+            "id": "video-spacy-irl-lemmatization",
+            "title": "Rethinking rule-based lemmatization",
+            "slogan": "spaCy IRL 2019",
+            "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc",
+            "author": "Guadalupe Romero",
+            "author_links": {
+                "twitter": "_guadiromero",
+                "github": "guadi1994"
+            },
+            "youtube": "88zcQODyuko",
+            "category": ["videos"]
+        },
+        {
+            "type": "education",
+            "id": "video-spacy-irl-scispacy",
+            "title": "ScispaCy: A spaCy pipeline & models for scientific & biomedical text",
+            "slogan": "spaCy IRL 2019",
+            "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc",
+            "author": "Mark Neumann",
+            "author_links": {
+                "twitter": "MarkNeumannnn",
+                "github": "DeNeutoy"
+            },
+            "youtube": "2_HSKDALwuw",
+            "category": ["videos"]
+        },
         {
             "type": "education",
             "id": "podcast-nlp-highlights",

View File

@@ -86,6 +86,7 @@ const UniverseContent = ({ content = [], categories, pageContext, location, mdxC
                 <img
                     src={`https://img.youtube.com/vi/${youtube}/0.jpg`}
                     alt=""
+                    style={{ clipPath: 'inset(12.5% 0)' }}
                 />
             )
             return cover ? (