From ad2a514cdfbfaa8a41b48d3954a76e4e2c616eaa Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 12 Feb 2019 15:45:31 +0100
Subject: [PATCH] Show warning if phrase pattern Doc was overprocessed (#3255)

In most cases, the PhraseMatcher will match on the verbatim token text or,
as of v2.1, sometimes the lowercase text. This means that we only need a
tokenized Doc, without any other attributes.

If phrase patterns are created by processing large terminology lists with
the full `nlp` object, this can easily make things a lot slower, because
all pipeline components will be applied, even if we don't actually need
the attributes they set (like part-of-speech tags or dependency labels).

The warning message also includes a suggestion to use `nlp.make_doc` or
`nlp.tokenizer.pipe` for even faster processing. For now, the validation
has to be enabled explicitly by setting `validate=True`.
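As a rough sketch of the intended fast path (not part of this patch; the
model name and terminology list below are placeholders):

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.load("en_core_web_sm")  # placeholder pipeline
    terms = ["machine learning", "deep learning"]  # placeholder terms
    matcher = PhraseMatcher(nlp.vocab, validate=True)
    # Tokenize only, instead of running the tagger/parser on every term
    patterns = list(nlp.tokenizer.pipe(terms))
    matcher.add("TERMS", None, *patterns)

Because the pattern Docs are only tokenized, no warning is raised here.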
---
 spacy/errors.py                            |  7 +++++++
 spacy/matcher/phrasematcher.pyx            | 12 +++++++++---
 spacy/tests/matcher/test_phrase_matcher.py | 21 +++++++++++++++++++++
 3 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 89fb9c289..a0da31a1e 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -60,6 +60,13 @@ class Warnings(object):
         "make displaCy start another one. Instead, you should be able to "
         "replace displacy.serve with displacy.render to show the "
         "visualization.")
+    W012 = ("A Doc object you're adding to the PhraseMatcher for pattern "
+            "'{key}' is parsed and/or tagged, but to match on '{attr}', you "
+            "don't actually need this information. This means that creating "
+            "the patterns is potentially much slower, because all pipeline "
+            "components are applied. To only create tokenized Doc objects, "
+            "try using `nlp.make_doc(text)` or process all texts as a stream "
+            "using `list(nlp.tokenizer.pipe(all_texts))`.")


 @add_codes
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 3cc890dca..4abf275be 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -7,12 +7,12 @@ from murmurhash.mrmr cimport hash64
 from preshed.maps cimport PreshMap

 from .matcher cimport Matcher
-from ..attrs cimport ORTH, attr_id_t
+from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, attr_id_t
 from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr
 from ..typedefs cimport attr_t, hash_t

-from ..errors import Warnings, deprecation_warning
+from ..errors import Warnings, deprecation_warning, user_warning
 from ..attrs import FLAG61 as U_ENT
 from ..attrs import FLAG60 as B2_ENT
 from ..attrs import FLAG59 as B3_ENT
@@ -33,8 +33,9 @@ cdef class PhraseMatcher:
     cdef attr_id_t attr
     cdef public object _callbacks
     cdef public object _patterns
+    cdef public object _validate

-    def __init__(self, Vocab vocab, max_length=0, attr='ORTH'):
+    def __init__(self, Vocab vocab, max_length=0, attr='ORTH', validate=False):
         if max_length != 0:
             deprecation_warning(Warnings.W010)
         self.mem = Pool()
@@ -54,6 +55,7 @@ cdef class PhraseMatcher:
         ]
         self.matcher.add('Candidate', None, *abstract_patterns)
         self._callbacks = {}
+        self._validate = validate

     def __len__(self):
         """Get the number of rules added to the matcher. Note that this only
@@ -95,6 +97,10 @@ cdef class PhraseMatcher:
             length = doc.length
             if length == 0:
                 continue
+            if self._validate and (doc.is_tagged or doc.is_parsed) \
+                    and self.attr not in (DEP, POS, TAG, LEMMA):
+                string_attr = self.vocab.strings[self.attr]
+                user_warning(Warnings.W012.format(key=key, attr=string_attr))
             tags = get_bilou(length)
             phrase_key = <attr_t*>mem.alloc(length, sizeof(attr_t))
             for i, tag in enumerate(tags):
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 9ecd61465..a5fdb345e 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import pytest
 from spacy.matcher import PhraseMatcher
 from spacy.tokens import Doc
 from ..util import get_doc
@@ -78,3 +79,23 @@ def test_phrase_matcher_bool_attrs(en_vocab):
     assert end1 == 3
     assert start2 == 3
     assert end2 == 6
+
+
+def test_phrase_matcher_validation(en_vocab):
+    doc1 = Doc(en_vocab, words=["Test"])
+    doc1.is_parsed = True
+    doc2 = Doc(en_vocab, words=["Test"])
+    doc2.is_tagged = True
+    doc3 = Doc(en_vocab, words=["Test"])
+    matcher = PhraseMatcher(en_vocab, validate=True)
+    with pytest.warns(UserWarning):
+        matcher.add("TEST1", None, doc1)
+    with pytest.warns(UserWarning):
+        matcher.add("TEST2", None, doc2)
+    with pytest.warns(None) as record:
+        matcher.add("TEST3", None, doc3)
+    assert not record.list
+    matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
+    with pytest.warns(None) as record:
+        matcher.add("TEST4", None, doc2)
+    assert not record.list
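For reference, a minimal sketch of the behavior the new test exercises, as
seen from user code (assumes an installed English model; the key names are
illustrative only):

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.load("en_core_web_sm")
    matcher = PhraseMatcher(nlp.vocab, validate=True)
    # nlp(...) runs the full pipeline, so the pattern Doc is tagged and
    # parsed even though ORTH matching never uses those attributes:
    # this should emit warning W012.
    matcher.add("OVERPROCESSED", None, nlp("machine learning"))
    # nlp.make_doc(...) only tokenizes, so no warning is raised.
    matcher.add("TOKENIZED", None, nlp.make_doc("machine learning"))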