From 8fe7bdd0fa81c590e01eb99a35f195c835fa93d7 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 21 Aug 2019 14:00:37 +0200 Subject: [PATCH 1/4] Improve token pattern checking without validation (#4105) * Fix typo in rule-based matching docs * Improve token pattern checking without validation Add more detailed token pattern checks without full JSON pattern validation and provide more detailed error messages. Addresses #4070 (also related: #4063, #4100). * Check whether top-level attributes in patterns and attr for PhraseMatcher are in token pattern schema * Check whether attribute value types are supported in general (as opposed to per attribute with full validation) * Report various internal error types (OverflowError, AttributeError, KeyError) as ValueError with standard error messages * Check for tagger/parser in PhraseMatcher pipeline for attributes TAG, POS, LEMMA, and DEP * Add error messages with relevant details on how to use validate=True or nlp() instead of nlp.make_doc() * Support attr=TEXT for PhraseMatcher * Add NORM to schema * Expand tests for pattern validation, Matcher, PhraseMatcher, and EntityRuler * Remove unnecessary .keys() * Rephrase error messages * Add another type check to Matcher Add another type check to Matcher for more understandable error messages in some rare cases. * Support phrase_matcher_attr=TEXT for EntityRuler * Don't use spacy.errors in examples and bin scripts * Fix error code * Auto-format Also try get Azure pipelines to finally start a build :( * Update errors.py Co-authored-by: Ines Montani Co-authored-by: Matthew Honnibal --- bin/wiki_entity_linking/kb_creator.py | 7 +- .../wikidata_pretrain_kb.py | 5 +- .../wikidata_train_entity_linker.py | 9 +-- examples/training/pretrain_kb.py | 4 +- examples/training/train_entity_linker.py | 2 - spacy/displacy/render.py | 2 +- spacy/errors.py | 25 +++++-- spacy/matcher/_schemas.py | 4 ++ spacy/matcher/matcher.pyx | 29 ++++++-- spacy/matcher/phrasematcher.pyx | 10 +++ spacy/pipeline/entityruler.py | 2 + .../tests/matcher/test_pattern_validation.py | 71 +++++++++++++------ spacy/tests/matcher/test_phrase_matcher.py | 33 +++++++++ spacy/tests/pipeline/test_entity_ruler.py | 5 +- website/docs/usage/rule-based-matching.md | 12 ++-- 15 files changed, 162 insertions(+), 58 deletions(-) diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py index d88cf9c7e..bd862f536 100644 --- a/bin/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -8,8 +8,6 @@ from spacy.kb import KnowledgeBase import csv import datetime -from spacy import Errors - def create_kb( nlp, @@ -33,7 +31,10 @@ def create_kb( input_dim = nlp.vocab.vectors_length print("Loaded pre-trained vectors of size %s" % input_dim) else: - raise ValueError(Errors.E155) + raise ValueError( + "The `nlp` object should have access to pre-trained word vectors, " + " cf. https://spacy.io/usage/models#languages." 
+ ) # disable this part of the pipeline when rerunning the KB generation from preprocessed files if read_raw_data: diff --git a/bin/wiki_entity_linking/wikidata_pretrain_kb.py b/bin/wiki_entity_linking/wikidata_pretrain_kb.py index c9ca470cf..c5261cada 100644 --- a/bin/wiki_entity_linking/wikidata_pretrain_kb.py +++ b/bin/wiki_entity_linking/wikidata_pretrain_kb.py @@ -73,7 +73,10 @@ def main( # check the length of the nlp vectors if "vectors" not in nlp.meta or not nlp.vocab.vectors.size: - raise ValueError(Errors.E155) + raise ValueError( + "The `nlp` object should have access to pre-trained word vectors, " + " cf. https://spacy.io/usage/models#languages." + ) # STEP 2: create prior probabilities from WP print() diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py index a988b8f69..770919112 100644 --- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py +++ b/bin/wiki_entity_linking/wikidata_train_entity_linker.py @@ -19,8 +19,6 @@ from bin.wiki_entity_linking import training_set_creator import spacy from spacy.kb import KnowledgeBase - -from spacy import Errors from spacy.util import minibatch, compounding @@ -68,7 +66,7 @@ def main( # check that there is a NER component in the pipeline if "ner" not in nlp.pipe_names: - raise ValueError(Errors.E152) + raise ValueError("The `nlp` object should have a pre-trained `ner` component.") # STEP 2 : read the KB print() @@ -82,7 +80,10 @@ def main( print(now(), "STEP 3: reading training dataset from", loc_training) else: if not wp_xml: - raise ValueError(Errors.E153) + raise ValueError( + "Either provide a path to a preprocessed training directory, " + "or to the original Wikipedia XML dump." + ) if output_dir: loc_training = output_dir / "training_data" diff --git a/examples/training/pretrain_kb.py b/examples/training/pretrain_kb.py index 67cc1587d..d5281ad42 100644 --- a/examples/training/pretrain_kb.py +++ b/examples/training/pretrain_kb.py @@ -17,12 +17,10 @@ import plac from pathlib import Path from spacy.vocab import Vocab - import spacy from spacy.kb import KnowledgeBase from bin.wiki_entity_linking.train_descriptions import EntityEncoder -from spacy import Errors # Q2146908 (Russ Cochran): American golfer @@ -45,7 +43,7 @@ def main(vocab_path=None, model=None, output_dir=None, n_iter=50): If an output_dir is provided, the KB will be stored there in a file 'kb'. 
When providing an nlp model, the updated vocab will also be written to a directory in the output_dir.""" if model is None and vocab_path is None: - raise ValueError(Errors.E154) + raise ValueError("Either the `nlp` model or the `vocab` should be specified.") if model is not None: nlp = spacy.load(model) # load existing spaCy model diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index f925672f1..12ed531a6 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -22,8 +22,6 @@ from spacy.vocab import Vocab import spacy from spacy.kb import KnowledgeBase - -from spacy import Errors from spacy.tokens import Span from spacy.util import minibatch, compounding diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index fb25b60e2..23868c8bd 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -128,7 +128,7 @@ class DependencyRenderer(object): """ if start < 0 or end < 0: error_args = dict(start=start, end=end, label=label, dir=direction) - raise ValueError(Errors.E156.format(**error_args)) + raise ValueError(Errors.E157.format(**error_args)) level = self.levels.index(end - start) + 1 x_start = self.offset_x + start * self.distance + self.arrow_spacing if self.direction == "rtl": diff --git a/spacy/errors.py b/spacy/errors.py index 2a5150e8e..7811374a7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -431,13 +431,24 @@ class Errors(object): "same, but found '{nlp}' and '{vocab}' respectively.") E151 = ("Trying to call nlp.update without required annotation types. " "Expected top-level keys: {exp}. Got: {unexp}.") - E152 = ("The `nlp` object should have a pre-trained `ner` component.") - E153 = ("Either provide a path to a preprocessed training directory, " - "or to the original Wikipedia XML dump.") - E154 = ("Either the `nlp` model or the `vocab` should be specified.") - E155 = ("The `nlp` object should have access to pre-trained word vectors, " - " cf. https://spacy.io/usage/models#languages.") - E156 = ("Can't render negative values for dependency arc start or end. " + E152 = ("The attribute {attr} is not supported for token patterns. " + "Please use the option validate=True with Matcher, PhraseMatcher, " + "or EntityRuler for more details.") + E153 = ("The value type {vtype} is not supported for token patterns. " + "Please use the option validate=True with Matcher, PhraseMatcher, " + "or EntityRuler for more details.") + E154 = ("One of the attributes or values is not supported for token " + "patterns. Please use the option validate=True with Matcher, " + "PhraseMatcher, or EntityRuler for more details.") + E155 = ("The pipeline needs to include a tagger in order to use " + "PhraseMatcher with the attributes POS, TAG, or LEMMA. Try using " + "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of " + "list(nlp.tokenizer.pipe()).") + E156 = ("The pipeline needs to include a parser in order to use " + "PhraseMatcher with the attribute DEP. Try using " + "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of " + "list(nlp.tokenizer.pipe()).") + E157 = ("Can't render negative values for dependency arc start or end. 
" "Make sure that you're passing in absolute token indices, not " "relative token offsets.\nstart: {start}, end: {end}, label: " "{label}, direction: {dir}") diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py index f4dfebb6b..4f4992781 100644 --- a/spacy/matcher/_schemas.py +++ b/spacy/matcher/_schemas.py @@ -102,6 +102,10 @@ TOKEN_PATTERN_SCHEMA = { "title": "Entity label of single token", "$ref": "#/definitions/string_value", }, + "NORM": { + "title": "Normalized form of the token text", + "$ref": "#/definitions/string_value", + }, "LENGTH": { "title": "Token character length", "$ref": "#/definitions/integer_value", diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 86658ce99..3226dd124 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -112,9 +112,12 @@ cdef class Matcher: raise MatchPatternError(key, errors) key = self._normalize_key(key) for pattern in patterns: - specs = _preprocess_pattern(pattern, self.vocab.strings, - self._extensions, self._extra_predicates) - self.patterns.push_back(init_pattern(self.mem, key, specs)) + try: + specs = _preprocess_pattern(pattern, self.vocab.strings, + self._extensions, self._extra_predicates) + self.patterns.push_back(init_pattern(self.mem, key, specs)) + except OverflowError, AttributeError: + raise ValueError(Errors.E154.format()) self._patterns.setdefault(key, []) self._callbacks[key] = on_match self._patterns[key].extend(patterns) @@ -568,6 +571,8 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi # Signifier for 'any token' tokens.append((ONE, [(NULL_ATTR, 0)], [], [])) continue + if not isinstance(spec, dict): + raise ValueError(Errors.E154.format()) ops = _get_operators(spec) attr_values = _get_attr_values(spec, string_store) extensions = _get_extensions(spec, string_store, extensions_table) @@ -581,21 +586,29 @@ def _get_attr_values(spec, string_store): attr_values = [] for attr, value in spec.items(): if isinstance(attr, basestring): + attr = attr.upper() if attr == '_': continue - elif attr.upper() == "OP": + elif attr == "OP": continue - if attr.upper() == "TEXT": + if attr == "TEXT": attr = "ORTH" - attr = IDS.get(attr.upper()) + if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: + raise ValueError(Errors.E152.format(attr=attr)) + attr = IDS.get(attr) if isinstance(value, basestring): value = string_store.add(value) elif isinstance(value, bool): value = int(value) elif isinstance(value, dict): continue + else: + raise ValueError(Errors.E153.format(vtype=type(value).__name__)) if attr is not None: attr_values.append((attr, value)) + else: + # should be caught above using TOKEN_PATTERN_SCHEMA + raise ValueError(Errors.E152.format(attr=attr)) return attr_values @@ -755,11 +768,13 @@ def _get_operators(spec): return lookup[spec["OP"]] else: keys = ", ".join(lookup.keys()) - raise KeyError(Errors.E011.format(op=spec["OP"], opts=keys)) + raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys)) def _get_extensions(spec, string_store, name2index): attr_values = [] + if not isinstance(spec.get("_", {}), dict): + raise ValueError(Errors.E154.format()) for name, value in spec.get("_", {}).items(): if isinstance(value, dict): # Handle predicates (e.g. "IN", in the extra_predicates, not here. 
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 68821a085..3a8bec2df 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -12,6 +12,7 @@ from ..vocab cimport Vocab from ..tokens.doc cimport Doc, get_token_attr from ..typedefs cimport attr_t, hash_t +from ._schemas import TOKEN_PATTERN_SCHEMA from ..errors import Errors, Warnings, deprecation_warning, user_warning from ..attrs import FLAG61 as U_ENT from ..attrs import FLAG60 as B2_ENT @@ -62,6 +63,11 @@ cdef class PhraseMatcher: if isinstance(attr, long): self.attr = attr else: + attr = attr.upper() + if attr == "TEXT": + attr = "ORTH" + if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: + raise ValueError(Errors.E152.format(attr=attr)) self.attr = self.vocab.strings[attr] self.phrase_ids = PreshMap() abstract_patterns = [ @@ -123,6 +129,10 @@ cdef class PhraseMatcher: length = doc.length if length == 0: continue + if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged: + raise ValueError(Errors.E155.format()) + if self.attr == DEP and not doc.is_parsed: + raise ValueError(Errors.E156.format()) if self._validate and (doc.is_tagged or doc.is_parsed) \ and self.attr not in (DEP, POS, TAG, LEMMA): string_attr = self.vocab.strings[self.attr] diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 23c8c91ba..3f6f8811c 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -54,6 +54,8 @@ class EntityRuler(object): self.phrase_patterns = defaultdict(list) self.matcher = Matcher(nlp.vocab, validate=validate) if phrase_matcher_attr is not None: + if phrase_matcher_attr.upper() == "TEXT": + phrase_matcher_attr = "ORTH" self.phrase_matcher_attr = phrase_matcher_attr self.phrase_matcher = PhraseMatcher( nlp.vocab, attr=self.phrase_matcher_attr, validate=validate diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index a0a8d3a14..665bcf935 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -7,6 +7,36 @@ from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA from spacy.errors import MatchPatternError from spacy.util import get_json_validator, validate_json +# (pattern, num errors with validation, num errors identified with minimal +# checks) +TEST_PATTERNS = [ + # Bad patterns flagged in all cases + ([{"XX": "foo"}], 1, 1), + ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 1), + ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2, 1), + ([{"IS_PUNCT": True, "OP": "$"}], 1, 1), + ([{"IS_DIGIT": -1}], 1, 1), + ([{"ORTH": -1}], 1, 1), + ([{"_": "foo"}], 1, 1), + ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1), + ([1, 2, 3], 3, 1), + # Bad patterns flagged outside of Matcher + ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1, 0), + # Bad patterns not flagged with minimal checks + ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2, 0), + ([{"LENGTH": {"VALUE": 5}}], 1, 0), + ([{"TEXT": {"VALUE": "foo"}}], 1, 0), + # Good patterns + ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0), + ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0), + ([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0, 0), + ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0), + ([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0), + ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0), +] + +XFAIL_TEST_PATTERNS = [([{"orth": "foo"}], 0, 0)] + @pytest.fixture def validator(): @@ 
-22,27 +52,24 @@ def test_matcher_pattern_validation(en_vocab, pattern): matcher.add("TEST", None, pattern) -@pytest.mark.parametrize( - "pattern,n_errors", - [ - # Bad patterns - ([{"XX": "foo"}], 1), - ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2), - ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2), - ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2), - ([{"TEXT": {"VALUE": "foo"}}], 1), - ([{"LENGTH": {"VALUE": 5}}], 1), - ([{"_": "foo"}], 1), - ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1), - ([{"IS_PUNCT": True, "OP": "$"}], 1), - # Good patterns - ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0), - ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0), - ([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0), - ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0), - ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0), - ], -) -def test_pattern_validation(validator, pattern, n_errors): +@pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS) +def test_pattern_validation(validator, pattern, n_errors, _): errors = validate_json(pattern, validator) assert len(errors) == n_errors + + +@pytest.mark.xfail +@pytest.mark.parametrize("pattern,n_errors,_", XFAIL_TEST_PATTERNS) +def test_xfail_pattern_validation(validator, pattern, n_errors, _): + errors = validate_json(pattern, validator) + assert len(errors) == n_errors + + +@pytest.mark.parametrize("pattern,n_errors,n_min_errors", TEST_PATTERNS) +def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors): + matcher = Matcher(en_vocab) + if n_min_errors > 0: + with pytest.raises(ValueError): + matcher.add("TEST", None, pattern) + elif n_errors == 0: + matcher.add("TEST", None, pattern) diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index a5fdb345e..b82f9a058 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -99,3 +99,36 @@ def test_phrase_matcher_validation(en_vocab): with pytest.warns(None) as record: matcher.add("TEST4", None, doc2) assert not record.list + + +def test_attr_validation(en_vocab): + with pytest.raises(ValueError): + PhraseMatcher(en_vocab, attr="UNSUPPORTED") + + +def test_attr_pipeline_checks(en_vocab): + doc1 = Doc(en_vocab, words=["Test"]) + doc1.is_parsed = True + doc2 = Doc(en_vocab, words=["Test"]) + doc2.is_tagged = True + doc3 = Doc(en_vocab, words=["Test"]) + # DEP requires is_parsed + matcher = PhraseMatcher(en_vocab, attr="DEP") + matcher.add("TEST1", None, doc1) + with pytest.raises(ValueError): + matcher.add("TEST2", None, doc2) + with pytest.raises(ValueError): + matcher.add("TEST3", None, doc3) + # TAG, POS, LEMMA require is_tagged + for attr in ("TAG", "POS", "LEMMA"): + matcher = PhraseMatcher(en_vocab, attr=attr) + matcher.add("TEST2", None, doc2) + with pytest.raises(ValueError): + matcher.add("TEST1", None, doc1) + with pytest.raises(ValueError): + matcher.add("TEST3", None, doc3) + # TEXT/ORTH only require tokens + matcher = PhraseMatcher(en_vocab, attr="ORTH") + matcher.add("TEST3", None, doc3) + matcher = PhraseMatcher(en_vocab, attr="TEXT") + matcher.add("TEST3", None, doc3) diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 57e980ec3..660ad3b28 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -137,8 +137,9 @@ def test_entity_ruler_validate(nlp): valid_pattern = {"label": "HELLO", "pattern": 
[{"LOWER": "HELLO"}]} invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]} - # invalid pattern is added without errors without validate - ruler.add_patterns([invalid_pattern]) + # invalid pattern raises error without validate + with pytest.raises(ValueError): + ruler.add_patterns([invalid_pattern]) # valid pattern is added without errors with validate validated_ruler.add_patterns([valid_pattern]) diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index fbe8bdc67..1d67625a5 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -859,12 +859,12 @@ token pattern covering the exact tokenization of the term. To create the patterns, each phrase has to be processed with the `nlp` object. -If you have a mode loaded, doing this in a loop or list comprehension can easily -become inefficient and slow. If you **only need the tokenization and lexical -attributes**, you can run [`nlp.make_doc`](/api/language#make_doc) instead, -which will only run the tokenizer. For an additional speed boost, you can also -use the [`nlp.tokenizer.pipe`](/api/tokenizer#pipe) method, which will process -the texts as a stream. +If you have a model loaded, doing this in a loop or list comprehension can +easily become inefficient and slow. If you **only need the tokenization and +lexical attributes**, you can run [`nlp.make_doc`](/api/language#make_doc) +instead, which will only run the tokenizer. For an additional speed boost, you +can also use the [`nlp.tokenizer.pipe`](/api/tokenizer#pipe) method, which will +process the texts as a stream. ```diff - patterns = [nlp(term) for term in LOTS_OF_TERMS] From 4fe9329bfba009d626e1a442eeaeb094961a5817 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavle=20Vidanovi=C4=87?= Date: Wed, 21 Aug 2019 19:57:37 +0200 Subject: [PATCH 2/4] Serbian language code update "rs" -> "sr" (#4159) * Serbian stopwords added. (cyrillic alphabet) * spaCy Contribution agreement included. * Test initialize updated * Serbian language code update. 
--bugfix --- spacy/lang/{rs => sr}/__init__.py | 4 ++-- spacy/lang/{rs => sr}/stop_words.py | 0 spacy/tests/lang/test_initialize.py | 4 ++-- website/meta/languages.json | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) rename spacy/lang/{rs => sr}/__init__.py (91%) rename spacy/lang/{rs => sr}/stop_words.py (100%) diff --git a/spacy/lang/rs/__init__.py b/spacy/lang/sr/__init__.py similarity index 91% rename from spacy/lang/rs/__init__.py rename to spacy/lang/sr/__init__.py index 4228fb193..99090be3b 100644 --- a/spacy/lang/rs/__init__.py +++ b/spacy/lang/sr/__init__.py @@ -12,7 +12,7 @@ from ...util import update_exc, add_lookups class SerbianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: "rs" + lex_attr_getters[LANG] = lambda text: "sr" lex_attr_getters[NORM] = add_lookups( Language.Defaults.lex_attr_getters[NORM], BASE_NORMS ) @@ -21,7 +21,7 @@ class SerbianDefaults(Language.Defaults): class Serbian(Language): - lang = "rs" + lang = "sr" Defaults = SerbianDefaults diff --git a/spacy/lang/rs/stop_words.py b/spacy/lang/sr/stop_words.py similarity index 100% rename from spacy/lang/rs/stop_words.py rename to spacy/lang/sr/stop_words.py diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index f6938c252..9b01340e3 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -10,8 +10,8 @@ from spacy.util import get_lang_class # excluded: ja, ru, th, uk, vi, zh LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is", - "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "rs", "si", - "sk", "sl", "sq", "sv", "ta", "te", "tl", "tr", "tt", "ur"] + "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk", + "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur"] # fmt: on diff --git a/website/meta/languages.json b/website/meta/languages.json index 8f91698ec..0a54efead 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -127,7 +127,7 @@ { "code": "is", "name": "Icelandic" }, { "code": "lt", "name": "Lithuanian" }, { "code": "lv", "name": "Latvian" }, - { "code": "rs", "name": "Serbian" }, + { "code": "sr", "name": "Serbian" }, { "code": "sk", "name": "Slovak" }, { "code": "sl", "name": "Slovenian" }, { From 2d17b047e22be4500bbd2f83898fbca2168917f4 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 21 Aug 2019 20:52:36 +0200 Subject: [PATCH 3/4] Check for is_tagged/is_parsed for Matcher attrs (#4163) Check for relevant components in the pipeline when Matcher is called, similar to the checks for PhraseMatcher in #4105. * keep track of attributes seen in patterns * when Matcher is called on a Doc, check for is_tagged for LEMMA, TAG, POS and for is_parsed for DEP --- spacy/errors.py | 8 +++--- spacy/matcher/matcher.pxd | 1 + spacy/matcher/matcher.pyx | 12 +++++++-- spacy/tests/matcher/test_matcher_api.py | 36 +++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 6 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 7811374a7..489f70ca7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -441,11 +441,11 @@ class Errors(object): "patterns. Please use the option validate=True with Matcher, " "PhraseMatcher, or EntityRuler for more details.") E155 = ("The pipeline needs to include a tagger in order to use " - "PhraseMatcher with the attributes POS, TAG, or LEMMA. 
Try using " - "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of " - "list(nlp.tokenizer.pipe()).") + "Matcher or PhraseMatcher with the attributes POS, TAG, or LEMMA. " + "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) " + "instead of list(nlp.tokenizer.pipe()).") E156 = ("The pipeline needs to include a parser in order to use " - "PhraseMatcher with the attribute DEP. Try using " + "Matcher or PhraseMatcher with the attribute DEP. Try using " "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of " "list(nlp.tokenizer.pipe()).") E157 = ("Can't render negative values for dependency arc start or end. " diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index 2e46cb376..a146cd107 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -67,3 +67,4 @@ cdef class Matcher: cdef public object _callbacks cdef public object _extensions cdef public object _extra_predicates + cdef public object _seen_attrs diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 3226dd124..a79542929 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -15,7 +15,7 @@ from ..structs cimport TokenC from ..vocab cimport Vocab from ..tokens.doc cimport Doc, get_token_attr from ..tokens.token cimport Token -from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH +from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA from ._schemas import TOKEN_PATTERN_SCHEMA from ..util import get_json_validator, validate_json @@ -45,7 +45,7 @@ cdef class Matcher: self._patterns = {} self._callbacks = {} self._extensions = {} - self._extra_predicates = [] + self._seen_attrs = set() self.vocab = vocab self.mem = Pool() if validate: @@ -116,6 +116,9 @@ cdef class Matcher: specs = _preprocess_pattern(pattern, self.vocab.strings, self._extensions, self._extra_predicates) self.patterns.push_back(init_pattern(self.mem, key, specs)) + for spec in specs: + for attr, _ in spec[1]: + self._seen_attrs.add(attr) except OverflowError, AttributeError: raise ValueError(Errors.E154.format()) self._patterns.setdefault(key, []) @@ -180,6 +183,11 @@ cdef class Matcher: describing the matches. A match tuple describes a span `doc[start:end]`. The `label_id` and `key` are both integers. 
""" + if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \ + and not doc.is_tagged: + raise ValueError(Errors.E155.format()) + if DEP in self._seen_attrs and not doc.is_parsed: + raise ValueError(Errors.E156.format()) matches = find_matches(&self.patterns[0], self.patterns.size(), doc, extensions=self._extensions, predicates=self._extra_predicates) diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 013700d52..401b7c928 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -344,3 +344,39 @@ def test_dependency_matcher_compile(dependency_matcher): # assert matches[0][1] == [[3, 1, 2]] # assert matches[1][1] == [[4, 3, 3]] # assert matches[2][1] == [[4, 3, 2]] + + +def test_attr_pipeline_checks(en_vocab): + doc1 = Doc(en_vocab, words=["Test"]) + doc1.is_parsed = True + doc2 = Doc(en_vocab, words=["Test"]) + doc2.is_tagged = True + doc3 = Doc(en_vocab, words=["Test"]) + # DEP requires is_parsed + matcher = Matcher(en_vocab) + matcher.add("TEST", None, [{"DEP": "a"}]) + matcher(doc1) + with pytest.raises(ValueError): + matcher(doc2) + with pytest.raises(ValueError): + matcher(doc3) + # TAG, POS, LEMMA require is_tagged + for attr in ("TAG", "POS", "LEMMA"): + matcher = Matcher(en_vocab) + matcher.add("TEST", None, [{attr: "a"}]) + matcher(doc2) + with pytest.raises(ValueError): + matcher(doc1) + with pytest.raises(ValueError): + matcher(doc3) + # TEXT/ORTH only require tokens + matcher = Matcher(en_vocab) + matcher.add("TEST", None, [{"ORTH": "a"}]) + matcher(doc1) + matcher(doc2) + matcher(doc3) + matcher = Matcher(en_vocab) + matcher.add("TEST", None, [{"TEXT": "a"}]) + matcher(doc1) + matcher(doc2) + matcher(doc3) From b072c13017b8026d6767b6228863241895718bb0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 21 Aug 2019 21:35:37 +0200 Subject: [PATCH 4/4] Update universe with videos [ci skip] --- website/meta/universe.json | 56 +++++++++++++++++++++++++++++++ website/src/templates/universe.js | 1 + 2 files changed, 57 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index c66915e8e..c3eee3901 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1089,6 +1089,62 @@ "youtube": "6zm9NC9uRkk", "category": ["videos"] }, + { + "type": "education", + "id": "video-intro-to-nlp-episode-1", + "title": "Intro to NLP with spaCy", + "slogan": "Episode 1: Data exploration", + "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. 
Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recogntion model from scratch.", + "author": "Vincent Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning" + }, + "youtube": "WnGPv6HnBok", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-spacy-irl-entity-linking", + "title": "Entity Linking functionality in spaCy", + "slogan": "spaCy IRL 2019", + "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc", + "author": "Sofie Van Landeghem", + "author_links": { + "twitter": "OxyKodit", + "github": "svlandeg" + }, + "youtube": "PW3RJM8tDGo", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-spacy-irl-lemmatization", + "title": "Rethinking rule-based lemmatization", + "slogan": "spaCy IRL 2019", + "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc", + "author": "Guadalupe Romero", + "author_links": { + "twitter": "_guadiromero", + "github": "guadi1994" + }, + "youtube": "88zcQODyuko", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-spacy-irl-scispacy", + "title": "ScispaCy: A spaCy pipeline & models for scientific & biomedical text", + "slogan": "spaCy IRL 2019", + "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc", + "author": "Mark Neumann", + "author_links": { + "twitter": "MarkNeumannnn", + "github": "DeNeutoy" + }, + "youtube": "2_HSKDALwuw", + "category": ["videos"] + }, { "type": "education", "id": "podcast-nlp-highlights", diff --git a/website/src/templates/universe.js b/website/src/templates/universe.js index 379b1a541..e49e81b01 100644 --- a/website/src/templates/universe.js +++ b/website/src/templates/universe.js @@ -86,6 +86,7 @@ const UniverseContent = ({ content = [], categories, pageContext, location, mdxC ) return cover ? (
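A minimal sketch of the runtime checks added in patches 1 and 3 (#4105, #4163), assuming a build with these changes and a pipeline that includes a tagger; the model name "en_core_web_sm" is an assumption, any tagged pipeline works:

```python
# Illustrative sketch (not part of the patches): PhraseMatcher pipeline checks.
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")  # assumed model name
matcher = PhraseMatcher(nlp.vocab, attr="POS")

# nlp.make_doc() only runs the tokenizer, so the pattern doc has no POS tags
# and adding it raises ValueError (E155) instead of silently never matching.
try:
    matcher.add("NAME", None, nlp.make_doc("Barack Obama"))
except ValueError as err:
    print("caught:", err)

# Running the full pipeline (nlp() or list(nlp.pipe(...))) tags the doc,
# so the same pattern can be added and matched as expected.
matcher.add("NAME", None, nlp("Barack Obama"))
doc = nlp("She met Barack Obama yesterday.")
print(matcher(doc))
```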