From 8fe7bdd0fa81c590e01eb99a35f195c835fa93d7 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 21 Aug 2019 14:00:37 +0200 Subject: [PATCH 1/4] Improve token pattern checking without validation (#4105) * Fix typo in rule-based matching docs * Improve token pattern checking without validation Add more detailed token pattern checks without full JSON pattern validation and provide more detailed error messages. Addresses #4070 (also related: #4063, #4100). * Check whether top-level attributes in patterns and attr for PhraseMatcher are in token pattern schema * Check whether attribute value types are supported in general (as opposed to per attribute with full validation) * Report various internal error types (OverflowError, AttributeError, KeyError) as ValueError with standard error messages * Check for tagger/parser in PhraseMatcher pipeline for attributes TAG, POS, LEMMA, and DEP * Add error messages with relevant details on how to use validate=True or nlp() instead of nlp.make_doc() * Support attr=TEXT for PhraseMatcher * Add NORM to schema * Expand tests for pattern validation, Matcher, PhraseMatcher, and EntityRuler * Remove unnecessary .keys() * Rephrase error messages * Add another type check to Matcher Add another type check to Matcher for more understandable error messages in some rare cases. * Support phrase_matcher_attr=TEXT for EntityRuler * Don't use spacy.errors in examples and bin scripts * Fix error code * Auto-format Also try get Azure pipelines to finally start a build :( * Update errors.py Co-authored-by: Ines Montani Co-authored-by: Matthew Honnibal --- bin/wiki_entity_linking/kb_creator.py | 7 +- .../wikidata_pretrain_kb.py | 5 +- .../wikidata_train_entity_linker.py | 9 +-- examples/training/pretrain_kb.py | 4 +- examples/training/train_entity_linker.py | 2 - spacy/displacy/render.py | 2 +- spacy/errors.py | 25 +++++-- spacy/matcher/_schemas.py | 4 ++ spacy/matcher/matcher.pyx | 29 ++++++-- spacy/matcher/phrasematcher.pyx | 10 +++ spacy/pipeline/entityruler.py | 2 + .../tests/matcher/test_pattern_validation.py | 71 +++++++++++++------ spacy/tests/matcher/test_phrase_matcher.py | 33 +++++++++ spacy/tests/pipeline/test_entity_ruler.py | 5 +- website/docs/usage/rule-based-matching.md | 12 ++-- 15 files changed, 162 insertions(+), 58 deletions(-) diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py index d88cf9c7e..bd862f536 100644 --- a/bin/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -8,8 +8,6 @@ from spacy.kb import KnowledgeBase import csv import datetime -from spacy import Errors - def create_kb( nlp, @@ -33,7 +31,10 @@ def create_kb( input_dim = nlp.vocab.vectors_length print("Loaded pre-trained vectors of size %s" % input_dim) else: - raise ValueError(Errors.E155) + raise ValueError( + "The `nlp` object should have access to pre-trained word vectors, " + " cf. https://spacy.io/usage/models#languages." 
+ ) # disable this part of the pipeline when rerunning the KB generation from preprocessed files if read_raw_data: diff --git a/bin/wiki_entity_linking/wikidata_pretrain_kb.py b/bin/wiki_entity_linking/wikidata_pretrain_kb.py index c9ca470cf..c5261cada 100644 --- a/bin/wiki_entity_linking/wikidata_pretrain_kb.py +++ b/bin/wiki_entity_linking/wikidata_pretrain_kb.py @@ -73,7 +73,10 @@ def main( # check the length of the nlp vectors if "vectors" not in nlp.meta or not nlp.vocab.vectors.size: - raise ValueError(Errors.E155) + raise ValueError( + "The `nlp` object should have access to pre-trained word vectors, " + " cf. https://spacy.io/usage/models#languages." + ) # STEP 2: create prior probabilities from WP print() diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py index a988b8f69..770919112 100644 --- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py +++ b/bin/wiki_entity_linking/wikidata_train_entity_linker.py @@ -19,8 +19,6 @@ from bin.wiki_entity_linking import training_set_creator import spacy from spacy.kb import KnowledgeBase - -from spacy import Errors from spacy.util import minibatch, compounding @@ -68,7 +66,7 @@ def main( # check that there is a NER component in the pipeline if "ner" not in nlp.pipe_names: - raise ValueError(Errors.E152) + raise ValueError("The `nlp` object should have a pre-trained `ner` component.") # STEP 2 : read the KB print() @@ -82,7 +80,10 @@ def main( print(now(), "STEP 3: reading training dataset from", loc_training) else: if not wp_xml: - raise ValueError(Errors.E153) + raise ValueError( + "Either provide a path to a preprocessed training directory, " + "or to the original Wikipedia XML dump." + ) if output_dir: loc_training = output_dir / "training_data" diff --git a/examples/training/pretrain_kb.py b/examples/training/pretrain_kb.py index 67cc1587d..d5281ad42 100644 --- a/examples/training/pretrain_kb.py +++ b/examples/training/pretrain_kb.py @@ -17,12 +17,10 @@ import plac from pathlib import Path from spacy.vocab import Vocab - import spacy from spacy.kb import KnowledgeBase from bin.wiki_entity_linking.train_descriptions import EntityEncoder -from spacy import Errors # Q2146908 (Russ Cochran): American golfer @@ -45,7 +43,7 @@ def main(vocab_path=None, model=None, output_dir=None, n_iter=50): If an output_dir is provided, the KB will be stored there in a file 'kb'. 
When providing an nlp model, the updated vocab will also be written to a directory in the output_dir.""" if model is None and vocab_path is None: - raise ValueError(Errors.E154) + raise ValueError("Either the `nlp` model or the `vocab` should be specified.") if model is not None: nlp = spacy.load(model) # load existing spaCy model diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index f925672f1..12ed531a6 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -22,8 +22,6 @@ from spacy.vocab import Vocab import spacy from spacy.kb import KnowledgeBase - -from spacy import Errors from spacy.tokens import Span from spacy.util import minibatch, compounding diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index fb25b60e2..23868c8bd 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -128,7 +128,7 @@ class DependencyRenderer(object): """ if start < 0 or end < 0: error_args = dict(start=start, end=end, label=label, dir=direction) - raise ValueError(Errors.E156.format(**error_args)) + raise ValueError(Errors.E157.format(**error_args)) level = self.levels.index(end - start) + 1 x_start = self.offset_x + start * self.distance + self.arrow_spacing if self.direction == "rtl": diff --git a/spacy/errors.py b/spacy/errors.py index 2a5150e8e..7811374a7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -431,13 +431,24 @@ class Errors(object): "same, but found '{nlp}' and '{vocab}' respectively.") E151 = ("Trying to call nlp.update without required annotation types. " "Expected top-level keys: {exp}. Got: {unexp}.") - E152 = ("The `nlp` object should have a pre-trained `ner` component.") - E153 = ("Either provide a path to a preprocessed training directory, " - "or to the original Wikipedia XML dump.") - E154 = ("Either the `nlp` model or the `vocab` should be specified.") - E155 = ("The `nlp` object should have access to pre-trained word vectors, " - " cf. https://spacy.io/usage/models#languages.") - E156 = ("Can't render negative values for dependency arc start or end. " + E152 = ("The attribute {attr} is not supported for token patterns. " + "Please use the option validate=True with Matcher, PhraseMatcher, " + "or EntityRuler for more details.") + E153 = ("The value type {vtype} is not supported for token patterns. " + "Please use the option validate=True with Matcher, PhraseMatcher, " + "or EntityRuler for more details.") + E154 = ("One of the attributes or values is not supported for token " + "patterns. Please use the option validate=True with Matcher, " + "PhraseMatcher, or EntityRuler for more details.") + E155 = ("The pipeline needs to include a tagger in order to use " + "PhraseMatcher with the attributes POS, TAG, or LEMMA. Try using " + "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of " + "list(nlp.tokenizer.pipe()).") + E156 = ("The pipeline needs to include a parser in order to use " + "PhraseMatcher with the attribute DEP. Try using " + "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of " + "list(nlp.tokenizer.pipe()).") + E157 = ("Can't render negative values for dependency arc start or end. 
" "Make sure that you're passing in absolute token indices, not " "relative token offsets.\nstart: {start}, end: {end}, label: " "{label}, direction: {dir}") diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py index f4dfebb6b..4f4992781 100644 --- a/spacy/matcher/_schemas.py +++ b/spacy/matcher/_schemas.py @@ -102,6 +102,10 @@ TOKEN_PATTERN_SCHEMA = { "title": "Entity label of single token", "$ref": "#/definitions/string_value", }, + "NORM": { + "title": "Normalized form of the token text", + "$ref": "#/definitions/string_value", + }, "LENGTH": { "title": "Token character length", "$ref": "#/definitions/integer_value", diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 86658ce99..3226dd124 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -112,9 +112,12 @@ cdef class Matcher: raise MatchPatternError(key, errors) key = self._normalize_key(key) for pattern in patterns: - specs = _preprocess_pattern(pattern, self.vocab.strings, - self._extensions, self._extra_predicates) - self.patterns.push_back(init_pattern(self.mem, key, specs)) + try: + specs = _preprocess_pattern(pattern, self.vocab.strings, + self._extensions, self._extra_predicates) + self.patterns.push_back(init_pattern(self.mem, key, specs)) + except OverflowError, AttributeError: + raise ValueError(Errors.E154.format()) self._patterns.setdefault(key, []) self._callbacks[key] = on_match self._patterns[key].extend(patterns) @@ -568,6 +571,8 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi # Signifier for 'any token' tokens.append((ONE, [(NULL_ATTR, 0)], [], [])) continue + if not isinstance(spec, dict): + raise ValueError(Errors.E154.format()) ops = _get_operators(spec) attr_values = _get_attr_values(spec, string_store) extensions = _get_extensions(spec, string_store, extensions_table) @@ -581,21 +586,29 @@ def _get_attr_values(spec, string_store): attr_values = [] for attr, value in spec.items(): if isinstance(attr, basestring): + attr = attr.upper() if attr == '_': continue - elif attr.upper() == "OP": + elif attr == "OP": continue - if attr.upper() == "TEXT": + if attr == "TEXT": attr = "ORTH" - attr = IDS.get(attr.upper()) + if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: + raise ValueError(Errors.E152.format(attr=attr)) + attr = IDS.get(attr) if isinstance(value, basestring): value = string_store.add(value) elif isinstance(value, bool): value = int(value) elif isinstance(value, dict): continue + else: + raise ValueError(Errors.E153.format(vtype=type(value).__name__)) if attr is not None: attr_values.append((attr, value)) + else: + # should be caught above using TOKEN_PATTERN_SCHEMA + raise ValueError(Errors.E152.format(attr=attr)) return attr_values @@ -755,11 +768,13 @@ def _get_operators(spec): return lookup[spec["OP"]] else: keys = ", ".join(lookup.keys()) - raise KeyError(Errors.E011.format(op=spec["OP"], opts=keys)) + raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys)) def _get_extensions(spec, string_store, name2index): attr_values = [] + if not isinstance(spec.get("_", {}), dict): + raise ValueError(Errors.E154.format()) for name, value in spec.get("_", {}).items(): if isinstance(value, dict): # Handle predicates (e.g. "IN", in the extra_predicates, not here. 
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 68821a085..3a8bec2df 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -12,6 +12,7 @@ from ..vocab cimport Vocab from ..tokens.doc cimport Doc, get_token_attr from ..typedefs cimport attr_t, hash_t +from ._schemas import TOKEN_PATTERN_SCHEMA from ..errors import Errors, Warnings, deprecation_warning, user_warning from ..attrs import FLAG61 as U_ENT from ..attrs import FLAG60 as B2_ENT @@ -62,6 +63,11 @@ cdef class PhraseMatcher: if isinstance(attr, long): self.attr = attr else: + attr = attr.upper() + if attr == "TEXT": + attr = "ORTH" + if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: + raise ValueError(Errors.E152.format(attr=attr)) self.attr = self.vocab.strings[attr] self.phrase_ids = PreshMap() abstract_patterns = [ @@ -123,6 +129,10 @@ cdef class PhraseMatcher: length = doc.length if length == 0: continue + if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged: + raise ValueError(Errors.E155.format()) + if self.attr == DEP and not doc.is_parsed: + raise ValueError(Errors.E156.format()) if self._validate and (doc.is_tagged or doc.is_parsed) \ and self.attr not in (DEP, POS, TAG, LEMMA): string_attr = self.vocab.strings[self.attr] diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 23c8c91ba..3f6f8811c 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -54,6 +54,8 @@ class EntityRuler(object): self.phrase_patterns = defaultdict(list) self.matcher = Matcher(nlp.vocab, validate=validate) if phrase_matcher_attr is not None: + if phrase_matcher_attr.upper() == "TEXT": + phrase_matcher_attr = "ORTH" self.phrase_matcher_attr = phrase_matcher_attr self.phrase_matcher = PhraseMatcher( nlp.vocab, attr=self.phrase_matcher_attr, validate=validate diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index a0a8d3a14..665bcf935 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -7,6 +7,36 @@ from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA from spacy.errors import MatchPatternError from spacy.util import get_json_validator, validate_json +# (pattern, num errors with validation, num errors identified with minimal +# checks) +TEST_PATTERNS = [ + # Bad patterns flagged in all cases + ([{"XX": "foo"}], 1, 1), + ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 1), + ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2, 1), + ([{"IS_PUNCT": True, "OP": "$"}], 1, 1), + ([{"IS_DIGIT": -1}], 1, 1), + ([{"ORTH": -1}], 1, 1), + ([{"_": "foo"}], 1, 1), + ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1), + ([1, 2, 3], 3, 1), + # Bad patterns flagged outside of Matcher + ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1, 0), + # Bad patterns not flagged with minimal checks + ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2, 0), + ([{"LENGTH": {"VALUE": 5}}], 1, 0), + ([{"TEXT": {"VALUE": "foo"}}], 1, 0), + # Good patterns + ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0), + ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0), + ([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0, 0), + ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0), + ([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0), + ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0), +] + +XFAIL_TEST_PATTERNS = [([{"orth": "foo"}], 0, 0)] + @pytest.fixture def validator(): @@ 
-22,27 +52,24 @@ def test_matcher_pattern_validation(en_vocab, pattern): matcher.add("TEST", None, pattern) -@pytest.mark.parametrize( - "pattern,n_errors", - [ - # Bad patterns - ([{"XX": "foo"}], 1), - ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2), - ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2), - ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2), - ([{"TEXT": {"VALUE": "foo"}}], 1), - ([{"LENGTH": {"VALUE": 5}}], 1), - ([{"_": "foo"}], 1), - ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1), - ([{"IS_PUNCT": True, "OP": "$"}], 1), - # Good patterns - ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0), - ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0), - ([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0), - ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0), - ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0), - ], -) -def test_pattern_validation(validator, pattern, n_errors): +@pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS) +def test_pattern_validation(validator, pattern, n_errors, _): errors = validate_json(pattern, validator) assert len(errors) == n_errors + + +@pytest.mark.xfail +@pytest.mark.parametrize("pattern,n_errors,_", XFAIL_TEST_PATTERNS) +def test_xfail_pattern_validation(validator, pattern, n_errors, _): + errors = validate_json(pattern, validator) + assert len(errors) == n_errors + + +@pytest.mark.parametrize("pattern,n_errors,n_min_errors", TEST_PATTERNS) +def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors): + matcher = Matcher(en_vocab) + if n_min_errors > 0: + with pytest.raises(ValueError): + matcher.add("TEST", None, pattern) + elif n_errors == 0: + matcher.add("TEST", None, pattern) diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index a5fdb345e..b82f9a058 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -99,3 +99,36 @@ def test_phrase_matcher_validation(en_vocab): with pytest.warns(None) as record: matcher.add("TEST4", None, doc2) assert not record.list + + +def test_attr_validation(en_vocab): + with pytest.raises(ValueError): + PhraseMatcher(en_vocab, attr="UNSUPPORTED") + + +def test_attr_pipeline_checks(en_vocab): + doc1 = Doc(en_vocab, words=["Test"]) + doc1.is_parsed = True + doc2 = Doc(en_vocab, words=["Test"]) + doc2.is_tagged = True + doc3 = Doc(en_vocab, words=["Test"]) + # DEP requires is_parsed + matcher = PhraseMatcher(en_vocab, attr="DEP") + matcher.add("TEST1", None, doc1) + with pytest.raises(ValueError): + matcher.add("TEST2", None, doc2) + with pytest.raises(ValueError): + matcher.add("TEST3", None, doc3) + # TAG, POS, LEMMA require is_tagged + for attr in ("TAG", "POS", "LEMMA"): + matcher = PhraseMatcher(en_vocab, attr=attr) + matcher.add("TEST2", None, doc2) + with pytest.raises(ValueError): + matcher.add("TEST1", None, doc1) + with pytest.raises(ValueError): + matcher.add("TEST3", None, doc3) + # TEXT/ORTH only require tokens + matcher = PhraseMatcher(en_vocab, attr="ORTH") + matcher.add("TEST3", None, doc3) + matcher = PhraseMatcher(en_vocab, attr="TEXT") + matcher.add("TEST3", None, doc3) diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 57e980ec3..660ad3b28 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -137,8 +137,9 @@ def test_entity_ruler_validate(nlp): valid_pattern = {"label": "HELLO", "pattern": 
[{"LOWER": "HELLO"}]} invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]} - # invalid pattern is added without errors without validate - ruler.add_patterns([invalid_pattern]) + # invalid pattern raises error without validate + with pytest.raises(ValueError): + ruler.add_patterns([invalid_pattern]) # valid pattern is added without errors with validate validated_ruler.add_patterns([valid_pattern]) diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index fbe8bdc67..1d67625a5 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -859,12 +859,12 @@ token pattern covering the exact tokenization of the term. To create the patterns, each phrase has to be processed with the `nlp` object. -If you have a mode loaded, doing this in a loop or list comprehension can easily -become inefficient and slow. If you **only need the tokenization and lexical -attributes**, you can run [`nlp.make_doc`](/api/language#make_doc) instead, -which will only run the tokenizer. For an additional speed boost, you can also -use the [`nlp.tokenizer.pipe`](/api/tokenizer#pipe) method, which will process -the texts as a stream. +If you have a model loaded, doing this in a loop or list comprehension can +easily become inefficient and slow. If you **only need the tokenization and +lexical attributes**, you can run [`nlp.make_doc`](/api/language#make_doc) +instead, which will only run the tokenizer. For an additional speed boost, you +can also use the [`nlp.tokenizer.pipe`](/api/tokenizer#pipe) method, which will +process the texts as a stream. ```diff - patterns = [nlp(term) for term in LOTS_OF_TERMS] From 4fe9329bfba009d626e1a442eeaeb094961a5817 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavle=20Vidanovi=C4=87?= Date: Wed, 21 Aug 2019 19:57:37 +0200 Subject: [PATCH 2/4] Serbian language code update "rs" -> "sr" (#4159) * Serbian stopwords added. (cyrillic alphabet) * spaCy Contribution agreement included. * Test initialize updated * Serbian language code update. 
--bugfix --- spacy/lang/{rs => sr}/__init__.py | 4 ++-- spacy/lang/{rs => sr}/stop_words.py | 0 spacy/tests/lang/test_initialize.py | 4 ++-- website/meta/languages.json | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) rename spacy/lang/{rs => sr}/__init__.py (91%) rename spacy/lang/{rs => sr}/stop_words.py (100%) diff --git a/spacy/lang/rs/__init__.py b/spacy/lang/sr/__init__.py similarity index 91% rename from spacy/lang/rs/__init__.py rename to spacy/lang/sr/__init__.py index 4228fb193..99090be3b 100644 --- a/spacy/lang/rs/__init__.py +++ b/spacy/lang/sr/__init__.py @@ -12,7 +12,7 @@ from ...util import update_exc, add_lookups class SerbianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: "rs" + lex_attr_getters[LANG] = lambda text: "sr" lex_attr_getters[NORM] = add_lookups( Language.Defaults.lex_attr_getters[NORM], BASE_NORMS ) @@ -21,7 +21,7 @@ class SerbianDefaults(Language.Defaults): class Serbian(Language): - lang = "rs" + lang = "sr" Defaults = SerbianDefaults diff --git a/spacy/lang/rs/stop_words.py b/spacy/lang/sr/stop_words.py similarity index 100% rename from spacy/lang/rs/stop_words.py rename to spacy/lang/sr/stop_words.py diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index f6938c252..9b01340e3 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -10,8 +10,8 @@ from spacy.util import get_lang_class # excluded: ja, ru, th, uk, vi, zh LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is", - "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "rs", "si", - "sk", "sl", "sq", "sv", "ta", "te", "tl", "tr", "tt", "ur"] + "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk", + "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur"] # fmt: on diff --git a/website/meta/languages.json b/website/meta/languages.json index 8f91698ec..0a54efead 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -127,7 +127,7 @@ { "code": "is", "name": "Icelandic" }, { "code": "lt", "name": "Lithuanian" }, { "code": "lv", "name": "Latvian" }, - { "code": "rs", "name": "Serbian" }, + { "code": "sr", "name": "Serbian" }, { "code": "sk", "name": "Slovak" }, { "code": "sl", "name": "Slovenian" }, { From 2d17b047e22be4500bbd2f83898fbca2168917f4 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 21 Aug 2019 20:52:36 +0200 Subject: [PATCH 3/4] Check for is_tagged/is_parsed for Matcher attrs (#4163) Check for relevant components in the pipeline when Matcher is called, similar to the checks for PhraseMatcher in #4105. * keep track of attributes seen in patterns * when Matcher is called on a Doc, check for is_tagged for LEMMA, TAG, POS and for is_parsed for DEP --- spacy/errors.py | 8 +++--- spacy/matcher/matcher.pxd | 1 + spacy/matcher/matcher.pyx | 12 +++++++-- spacy/tests/matcher/test_matcher_api.py | 36 +++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 6 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 7811374a7..489f70ca7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -441,11 +441,11 @@ class Errors(object): "patterns. Please use the option validate=True with Matcher, " "PhraseMatcher, or EntityRuler for more details.") E155 = ("The pipeline needs to include a tagger in order to use " - "PhraseMatcher with the attributes POS, TAG, or LEMMA. 
Try using " - "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of " - "list(nlp.tokenizer.pipe()).") + "Matcher or PhraseMatcher with the attributes POS, TAG, or LEMMA. " + "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) " + "instead of list(nlp.tokenizer.pipe()).") E156 = ("The pipeline needs to include a parser in order to use " - "PhraseMatcher with the attribute DEP. Try using " + "Matcher or PhraseMatcher with the attribute DEP. Try using " "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of " "list(nlp.tokenizer.pipe()).") E157 = ("Can't render negative values for dependency arc start or end. " diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index 2e46cb376..a146cd107 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -67,3 +67,4 @@ cdef class Matcher: cdef public object _callbacks cdef public object _extensions cdef public object _extra_predicates + cdef public object _seen_attrs diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 3226dd124..a79542929 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -15,7 +15,7 @@ from ..structs cimport TokenC from ..vocab cimport Vocab from ..tokens.doc cimport Doc, get_token_attr from ..tokens.token cimport Token -from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH +from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA from ._schemas import TOKEN_PATTERN_SCHEMA from ..util import get_json_validator, validate_json @@ -45,7 +45,7 @@ cdef class Matcher: self._patterns = {} self._callbacks = {} self._extensions = {} - self._extra_predicates = [] + self._seen_attrs = set() self.vocab = vocab self.mem = Pool() if validate: @@ -116,6 +116,9 @@ cdef class Matcher: specs = _preprocess_pattern(pattern, self.vocab.strings, self._extensions, self._extra_predicates) self.patterns.push_back(init_pattern(self.mem, key, specs)) + for spec in specs: + for attr, _ in spec[1]: + self._seen_attrs.add(attr) except OverflowError, AttributeError: raise ValueError(Errors.E154.format()) self._patterns.setdefault(key, []) @@ -180,6 +183,11 @@ cdef class Matcher: describing the matches. A match tuple describes a span `doc[start:end]`. The `label_id` and `key` are both integers. 
""" + if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \ + and not doc.is_tagged: + raise ValueError(Errors.E155.format()) + if DEP in self._seen_attrs and not doc.is_parsed: + raise ValueError(Errors.E156.format()) matches = find_matches(&self.patterns[0], self.patterns.size(), doc, extensions=self._extensions, predicates=self._extra_predicates) diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 013700d52..401b7c928 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -344,3 +344,39 @@ def test_dependency_matcher_compile(dependency_matcher): # assert matches[0][1] == [[3, 1, 2]] # assert matches[1][1] == [[4, 3, 3]] # assert matches[2][1] == [[4, 3, 2]] + + +def test_attr_pipeline_checks(en_vocab): + doc1 = Doc(en_vocab, words=["Test"]) + doc1.is_parsed = True + doc2 = Doc(en_vocab, words=["Test"]) + doc2.is_tagged = True + doc3 = Doc(en_vocab, words=["Test"]) + # DEP requires is_parsed + matcher = Matcher(en_vocab) + matcher.add("TEST", None, [{"DEP": "a"}]) + matcher(doc1) + with pytest.raises(ValueError): + matcher(doc2) + with pytest.raises(ValueError): + matcher(doc3) + # TAG, POS, LEMMA require is_tagged + for attr in ("TAG", "POS", "LEMMA"): + matcher = Matcher(en_vocab) + matcher.add("TEST", None, [{attr: "a"}]) + matcher(doc2) + with pytest.raises(ValueError): + matcher(doc1) + with pytest.raises(ValueError): + matcher(doc3) + # TEXT/ORTH only require tokens + matcher = Matcher(en_vocab) + matcher.add("TEST", None, [{"ORTH": "a"}]) + matcher(doc1) + matcher(doc2) + matcher(doc3) + matcher = Matcher(en_vocab) + matcher.add("TEST", None, [{"TEXT": "a"}]) + matcher(doc1) + matcher(doc2) + matcher(doc3) From b072c13017b8026d6767b6228863241895718bb0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 21 Aug 2019 21:35:37 +0200 Subject: [PATCH 4/4] Update universe with videos [ci skip] --- website/meta/universe.json | 56 +++++++++++++++++++++++++++++++ website/src/templates/universe.js | 1 + 2 files changed, 57 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index c66915e8e..c3eee3901 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1089,6 +1089,62 @@ "youtube": "6zm9NC9uRkk", "category": ["videos"] }, + { + "type": "education", + "id": "video-intro-to-nlp-episode-1", + "title": "Intro to NLP with spaCy", + "slogan": "Episode 1: Data exploration", + "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. 
Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recogntion model from scratch.", + "author": "Vincent Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning" + }, + "youtube": "WnGPv6HnBok", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-spacy-irl-entity-linking", + "title": "Entity Linking functionality in spaCy", + "slogan": "spaCy IRL 2019", + "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc", + "author": "Sofie Van Landeghem", + "author_links": { + "twitter": "OxyKodit", + "github": "svlandeg" + }, + "youtube": "PW3RJM8tDGo", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-spacy-irl-lemmatization", + "title": "Rethinking rule-based lemmatization", + "slogan": "spaCy IRL 2019", + "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc", + "author": "Guadalupe Romero", + "author_links": { + "twitter": "_guadiromero", + "github": "guadi1994" + }, + "youtube": "88zcQODyuko", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-spacy-irl-scispacy", + "title": "ScispaCy: A spaCy pipeline & models for scientific & biomedical text", + "slogan": "spaCy IRL 2019", + "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc", + "author": "Mark Neumann", + "author_links": { + "twitter": "MarkNeumannnn", + "github": "DeNeutoy" + }, + "youtube": "2_HSKDALwuw", + "category": ["videos"] + }, { "type": "education", "id": "podcast-nlp-highlights", diff --git a/website/src/templates/universe.js b/website/src/templates/universe.js index 379b1a541..e49e81b01 100644 --- a/website/src/templates/universe.js +++ b/website/src/templates/universe.js @@ -86,6 +86,7 @@ const UniverseContent = ({ content = [], categories, pageContext, location, mdxC ) return cover ? (
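A minimal sketch of the runtime checks added in patches 1 and 3 (#4105, #4163), assuming a build with these changes and a pipeline that includes a tagger; the model name "en_core_web_sm" is an assumption, any tagged pipeline works:

```python
# Illustrative sketch (not part of the patches): PhraseMatcher pipeline checks.
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")  # assumed model name
matcher = PhraseMatcher(nlp.vocab, attr="POS")

# nlp.make_doc() only runs the tokenizer, so the pattern doc has no POS tags
# and adding it raises ValueError (E155) instead of silently never matching.
try:
    matcher.add("NAME", None, nlp.make_doc("Barack Obama"))
except ValueError as err:
    print("caught:", err)

# Running the full pipeline (nlp() or list(nlp.pipe(...))) tags the doc,
# so the same pattern can be added and matched as expected.
matcher.add("NAME", None, nlp("Barack Obama"))
doc = nlp("She met Barack Obama yesterday.")
print(matcher(doc))
```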