From 58e83fe34bbec777db588f19723668c2ec431f71 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 21 Sep 2016 14:54:55 +0200
Subject: [PATCH] Initial, limited support for quantified patterns in Matcher,
 and tracking of ent_id attribute in Token and Span.

The quantifiers need a lot more testing, and there are some known problems.
The main one is that the zero-plus and one-plus quantifiers won't work if a
token can match both the quantified pattern expression AND the tail of the
match.

---
 spacy/matcher.pyx           | 190 +++++++++++++++++++++++++-----------
 spacy/structs.pxd           |   4 +-
 spacy/tests/test_matcher.py |  77 +++++++++++----
 spacy/tokens/span.pyx       |  21 ++++
 spacy/tokens/token.pxd      |   3 +-
 spacy/tokens/token.pyx      |  37 ++++++-
 6 files changed, 250 insertions(+), 82 deletions(-)
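A note on the new pattern syntax: a token spec may now carry an 'OP' key
naming its quantifier. '!' requires the spec to match zero times at its slot,
'?' zero or one time, '*' zero or more times, and '+' is expanded as sugar
into a ONE slot followed by a ZERO_PLUS slot. A minimal usage sketch,
mirroring the new tests below (the Vocab/English setup is just one way to get
lexical attributes such as IS_PUNCT computed; it is not required by the
Matcher itself):

    from spacy.matcher import Matcher
    from spacy.tokens.doc import Doc
    from spacy.vocab import Vocab
    from spacy.en import English

    matcher = Matcher(Vocab(get_lex_attr=English.default_lex_attrs()), {})
    matcher.add('Quote', 'QUOTE', {}, [
        [
            {'ORTH': '"'},
            {'OP': '*', 'IS_PUNCT': False},  # any number of non-punct tokens
            {'ORTH': '"'},
        ]])
    doc = Doc(matcher.vocab, 'He said , " some words " ...'.split())
    assert len(matcher(doc)) == 1

This works because the closing '"' fails the quantified spec, which hands
control to the tail of the pattern. The known problem above is the opposite
case: when a token satisfies both a '*'/'+' spec and the tail, the quantifier
keeps consuming it greedily, so the tail never gets a chance to match.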
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 64fd4e683..017ac5973 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -12,9 +12,11 @@ from .lexeme cimport Lexeme
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
+from libcpp.pair cimport pair
 from murmurhash.mrmr cimport hash64
+from libc.stdint cimport int32_t
 
-from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
+from .attrs cimport ID, LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
 from . import attrs
 from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
@@ -59,58 +61,96 @@ except ImportError:
     import json
 
 
-cdef struct AttrValue:
+cpdef enum quantifier_t:
+    _META
+    ONE
+    ZERO
+    ZERO_ONE
+    ZERO_PLUS
+
+
+cdef enum action_t:
+    REJECT
+    ADVANCE
+    REPEAT
+    ACCEPT
+    ADVANCE_ZERO
+    PANIC
+
+
+cdef struct AttrValueC:
     attr_id_t attr
     attr_t value
 
 
-cdef struct Pattern:
-    AttrValue* spec
-    int length
+cdef struct TokenPatternC:
+    AttrValueC* attrs
+    int32_t nr_attr
+    quantifier_t quantifier
 
 
-cdef Pattern* init_pattern(Pool mem, object token_specs, attr_t entity_type) except NULL:
-    pattern = <Pattern*>mem.alloc(len(token_specs) + 1, sizeof(Pattern))
+ctypedef TokenPatternC* TokenPatternC_ptr
+ctypedef pair[int, TokenPatternC_ptr] StateC
+
+
+cdef TokenPatternC* init_pattern(Pool mem, object token_specs, attr_t entity_id,
+                                 attr_t entity_type) except NULL:
+    pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
     cdef int i
-    for i, spec in enumerate(token_specs):
-        pattern[i].spec = <AttrValue*>mem.alloc(len(spec), sizeof(AttrValue))
-        pattern[i].length = len(spec)
+    for i, (quantifier, spec) in enumerate(token_specs):
+        pattern[i].quantifier = quantifier
+        pattern[i].attrs = <AttrValueC*>mem.alloc(len(spec), sizeof(AttrValueC))
+        pattern[i].nr_attr = len(spec)
         for j, (attr, value) in enumerate(spec):
-            pattern[i].spec[j].attr = attr
-            pattern[i].spec[j].value = value
+            pattern[i].attrs[j].attr = attr
+            pattern[i].attrs[j].value = value
     i = len(token_specs)
-    pattern[i].spec = <AttrValue*>mem.alloc(2, sizeof(AttrValue))
-    pattern[i].spec[0].attr = ENT_TYPE
-    pattern[i].spec[0].value = entity_type
-    pattern[i].spec[1].attr = LENGTH
-    pattern[i].spec[1].value = len(token_specs)
-    pattern[i].length = 0
+    pattern[i].attrs = <AttrValueC*>mem.alloc(3, sizeof(AttrValueC))
+    pattern[i].attrs[0].attr = ID
+    pattern[i].attrs[0].value = entity_id
+    pattern[i].attrs[1].attr = ENT_TYPE
+    pattern[i].attrs[1].value = entity_type
+    pattern[i].nr_attr = 0
     return pattern
 
 
-cdef int match(const Pattern* pattern, const TokenC* token) except -1:
-    cdef int i
-    for i in range(pattern.length):
-        if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value:
-            return False
-    return True
-
-
-cdef int is_final(const Pattern* pattern) except -1:
-    return (pattern + 1).length == 0
-
-
-cdef object get_entity(const Pattern* pattern, const TokenC* tokens, int i):
-    pattern += 1
-    i += 1
-    return (pattern.spec[0].value, i - pattern.spec[1].value, i)
+cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
+    for attr in pattern.attrs[:pattern.nr_attr]:
+        if get_token_attr(token, attr.attr) != attr.value:
+            if pattern.quantifier == ONE:
+                return REJECT
+            elif pattern.quantifier == ZERO:
+                return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE
+            elif pattern.quantifier in (ZERO_ONE, ZERO_PLUS):
+                return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE_ZERO
+            else:
+                return PANIC
+    if pattern.quantifier == ZERO:
+        return REJECT
+    elif pattern.quantifier in (ONE, ZERO_ONE):
+        return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE
+    elif pattern.quantifier == ZERO_PLUS:
+        return REPEAT
+    else:
+        return PANIC
 
 
 def _convert_strings(token_specs, string_store):
-    converted = []
+    # Support the 'syntactic sugar' operator '+', as a combination of ONE, ZERO_PLUS
+    operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
+                 '?': (ZERO_ONE,)}
+    tokens = []
+    op = ONE
     for spec in token_specs:
-        converted.append([])
+        token = []
+        ops = (ONE,)
         for attr, value in spec.items():
+            if isinstance(attr, basestring) and attr.upper() == 'OP':
+                if value in operators:
+                    ops = operators[value]
+                else:
+                    raise KeyError(
+                        "Unknown operator. Options: %s" % ', '.join(operators.keys()))
             if isinstance(attr, basestring):
                 attr = attrs.IDS.get(attr.upper())
             if isinstance(value, basestring):
@@ -118,8 +158,10 @@ def _convert_strings(token_specs, string_store):
         if isinstance(value, bool):
             value = int(value)
         if attr is not None:
-            converted[-1].append((attr, value))
-    return converted
+            token.append((attr, value))
+        for op in ops:
+            tokens.append((op, token))
+    return tokens
 
 
 def get_bilou(length):
@@ -150,7 +192,7 @@
 
 
 cdef class Matcher:
     cdef Pool mem
-    cdef vector[Pattern*] patterns
+    cdef vector[TokenPatternC*] patterns
     cdef readonly Vocab vocab
     cdef object _patterns
@@ -189,15 +231,15 @@
         # entity
         for spec in specs:
             spec = _convert_strings(spec, self.vocab.strings)
-            self.patterns.push_back(init_pattern(self.mem, spec, etype))
+            self.patterns.push_back(init_pattern(self.mem, spec, entity_key, etype))
 
     def __call__(self, Doc doc, acceptor=None):
-        cdef vector[Pattern*] partials
+        cdef vector[StateC] partials
         cdef int n_partials = 0
         cdef int q = 0
         cdef int i, token_i
         cdef const TokenC* token
-        cdef Pattern* state
+        cdef StateC state
         matches = []
         for token_i in range(doc.length):
             token = &doc.c[token_i]
@@ -205,27 +247,57 @@
             # Go over the open matches, extending or finalizing if able. Otherwise,
             # we over-write them (q doesn't advance)
             for state in partials:
-                if match(state, token):
-                    if is_final(state):
-                        label, start, end = get_entity(state, token, token_i)
-                        if acceptor is None or acceptor(doc, label, start, end):
-                            matches.append((label, start, end))
-                    else:
-                        partials[q] = state + 1
-                        q += 1
+                action = get_action(state.second, token)
+                while action == ADVANCE_ZERO:
+                    state.second += 1
+                    action = get_action(state.second, token)
+                if action == REPEAT:
+                    # Leave the state in the queue, and advance to the next slot
+                    # (i.e. we don't overwrite -- we want to greedily match more
+                    # of the pattern).
+                    q += 1
+                elif action == REJECT:
+                    pass
+                elif action == ADVANCE:
+                    partials[q].second += 1
+                    q += 1
+                elif action == ACCEPT:
+                    # TODO: What to do about patterns starting with ZERO? Need to
+                    # adjust the start position.
+                    start = state.first
+                    end = token_i+1
+                    ent_id = state.second[1].attrs[0].value
+                    label = state.second[1].attrs[1].value
+                    if acceptor is None or acceptor(doc, ent_id, label, start, end):
+                        matches.append((ent_id, label, start, end))
             partials.resize(q)
             # Check whether we open any new patterns on this token
-            for state in self.patterns:
-                if match(state, token):
-                    if is_final(state):
-                        label, start, end = get_entity(state, token, token_i)
-                        if acceptor is None or acceptor(doc, label, start, end):
-                            matches.append((label, start, end))
-                    else:
-                        partials.push_back(state + 1)
+            for pattern in self.patterns:
+                action = get_action(pattern, token)
+                while action == ADVANCE_ZERO:
+                    pattern += 1
+                    action = get_action(pattern, token)
+                if action == REPEAT:
+                    state.first = token_i
+                    state.second = pattern
+                    partials.push_back(state)
+                elif action == ADVANCE:
+                    # TODO: What to do about patterns starting with ZERO? Need to
+                    # adjust the start position.
+                    state.first = token_i
+                    state.second = pattern + 1
+                    partials.push_back(state)
+                elif action == ACCEPT:
+                    start = token_i
+                    end = token_i+1
+                    ent_id = pattern[1].attrs[0].value
+                    label = pattern[1].attrs[1].value
+                    if acceptor is None or acceptor(doc, ent_id, label, start, end):
+                        matches.append((ent_id, label, start, end))
         seen = set()
         filtered = []
-        for label, start, end in sorted(matches, key=lambda m: (m[1], -(m[1] - m[2]))):
+        for ent_id, label, start, end in sorted(matches,
+                                                key=lambda m: (m[2], -(m[2]-m[3]))):
             if all(i in seen for i in range(start, end)):
                 continue
             else:
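With the matcher.pyx changes, matches come back as (ent_id, label, start, end)
4-tuples rather than the old (label, start, end), where both IDs are hashes
into the StringStore. A sketch of consuming the new shape, reusing the
`matcher` and `doc` from the sketch above:

    for ent_id, label, start, end in matcher(doc):
        # Both ent_id and label are StringStore hashes; resolve for display.
        words = [doc[i].orth_ for i in range(start, end)]
        print(matcher.vocab.strings[ent_id], words)

(The label hash can be resolved the same way whenever the pattern was added
with a non-empty label.)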
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index f7e6b1ec7..ae1cfb434 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -29,6 +29,7 @@ cdef struct LexemeC:
 
 
 cdef struct Entity:
+    hash_t id
     int start
     int end
     int label
@@ -53,4 +54,5 @@ cdef struct TokenC:
     uint32_t r_edge
 
     int ent_iob
-    int ent_type
+    int ent_type  # TODO: Is there a better way to do this? Multiple sources of truth...
+    hash_t ent_id
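The struct changes mirror each other: Entity gets an id and each TokenC gets
an ent_id, both hash_t keys into the StringStore (hence the 'multiple sources
of truth' TODO). The integer and string forms round-trip through the store,
which is what the new ent_id_ properties below rely on; e.g., for any `vocab`:

    key = vocab.strings['GoogleNow']          # unicode -> hash
    assert vocab.strings[key] == 'GoogleNow'  # hash -> unicode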
diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py
index 986d8a8bd..9dce3177f 100644
--- a/spacy/tests/test_matcher.py
+++ b/spacy/tests/test_matcher.py
@@ -6,46 +6,85 @@
 from spacy.matcher import *
 from spacy.attrs import LOWER
 from spacy.tokens.doc import Doc
 from spacy.vocab import Vocab
+from spacy.en import English
 
 
 @pytest.fixture
-def matcher(EN):
+def matcher():
     patterns = {
-        'Javascript': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]],
+        'JS': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]],
         'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]],
         'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]],
     }
-    return Matcher(EN.vocab, patterns)
+    return Matcher(Vocab(get_lex_attr=English.default_lex_attrs()), patterns)
 
 
 def test_compile(matcher):
     assert matcher.n_patterns == 3
 
 
-def test_no_match(matcher, EN):
-    tokens = EN('I like cheese')
-    assert matcher(tokens) == []
+def test_no_match(matcher):
+    doc = Doc(matcher.vocab, ['I', 'like', 'cheese', '.'])
+    assert matcher(doc) == []
 
 
-def test_match_start(matcher, EN):
-    tokens = EN('JavaScript is good')
-    assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 0, 1)]
+def test_match_start(matcher):
+    doc = Doc(matcher.vocab, ['JavaScript', 'is', 'good'])
+    assert matcher(doc) == [(matcher.vocab.strings['JS'],
+                             matcher.vocab.strings['PRODUCT'], 0, 1)]
 
 
-def test_match_end(matcher, EN):
-    tokens = EN('I like java')
-    assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 3)]
+def test_match_end(matcher):
+    doc = Doc(matcher.vocab, ['I', 'like', 'java'])
+    assert matcher(doc) == [(doc.vocab.strings['Java'],
+                             doc.vocab.strings['PRODUCT'], 2, 3)]
 
 
-def test_match_middle(matcher, EN):
-    tokens = EN('I like Google Now best')
-    assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 4)]
+def test_match_middle(matcher):
+    doc = Doc(matcher.vocab, ['I', 'like', 'Google', 'Now', 'best'])
+    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'],
+                             doc.vocab.strings['PRODUCT'], 2, 4)]
 
 
-def test_match_multi(matcher, EN):
-    tokens = EN('I like Google Now and java best')
-    assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 4),
-                               (EN.vocab.strings['PRODUCT'], 5, 6)]
+def test_match_multi(matcher):
+    doc = Doc(matcher.vocab, 'I like Google Now and java best'.split())
+    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'],
+                             doc.vocab.strings['PRODUCT'], 2, 4),
+                            (doc.vocab.strings['Java'],
+                             doc.vocab.strings['PRODUCT'], 5, 6)]
+
+
+def test_match_zero(matcher):
+    matcher.add('Quote', '', {}, [
+        [
+            {'ORTH': '"'},
+            {'OP': '!', 'IS_PUNCT': True},
+            {'OP': '!', 'IS_PUNCT': True},
+            {'ORTH': '"'}
+        ]])
+    doc = Doc(matcher.vocab, 'He said , " some words " ...'.split())
+    assert len(matcher(doc)) == 1
+    doc = Doc(matcher.vocab, 'He said , " some three words " ...'.split())
+    assert len(matcher(doc)) == 0
+    matcher.add('Quote', '', {}, [
+        [
+            {'ORTH': '"'},
+            {'IS_PUNCT': True},
+            {'IS_PUNCT': True},
+            {'IS_PUNCT': True},
+            {'ORTH': '"'}
+        ]])
+    assert len(matcher(doc)) == 0
+
+
+def test_match_zero_plus(matcher):
+    matcher.add('Quote', '', {}, [
+        [
+            {'ORTH': '"'},
+            {'OP': '*', 'IS_PUNCT': False},
+            {'ORTH': '"'}
+        ]])
+    doc = Doc(matcher.vocab, 'He said , " some words " ...'.split())
+    assert len(matcher(doc)) == 1
 
 
 @pytest.mark.models
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 884bb74d8..159074cd4 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -241,6 +241,27 @@ cdef class Span:
             for word in self.rights:
                 yield from word.subtree
 
+    property ent_id:
+        '''An (integer) entity ID. Usually assigned by patterns in the Matcher.'''
+        def __get__(self):
+            return self.root.ent_id
+
+        def __set__(self, hash_t key):
+            # TODO
+            raise NotImplementedError(
+                "Can't yet set ent_id from Span. Vote for this feature on the issue "
+                "tracker: http://github.com/spacy-io/spaCy")
+
+    property ent_id_:
+        '''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
+        def __get__(self):
+            return self.root.ent_id_
+
+        def __set__(self, hash_t key):
+            # TODO
+            raise NotImplementedError(
+                "Can't yet set ent_id_ from Span. Vote for this feature on the issue "
+                "tracker: http://github.com/spacy-io/spaCy")
+
     property orth_:
         def __get__(self):
             return ''.join([t.string for t in self]).strip()
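Span.ent_id and Span.ent_id_ simply delegate to the span's root token, and
assignment is stubbed out on both Span and Token for now. Roughly, for some
hypothetical `span`:

    span.ent_id       # == span.root.ent_id, a hash_t
    span.ent_id_      # the string form, resolved through the StringStore
    try:
        span.ent_id = 12345
    except NotImplementedError:
        pass          # writing ent_id is not supported yet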
diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index 1706cdc55..aa2f09394 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -5,10 +5,9 @@ from .doc cimport Doc
 
 
 cdef class Token:
-    cdef Vocab vocab
+    cdef readonly Vocab vocab
     cdef TokenC* c
     cdef readonly int i
-    cdef int array_len
     cdef readonly Doc doc
 
     @staticmethod
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 0221a1eb9..9f1a70636 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -58,7 +58,6 @@ cdef class Token:
         self.doc = doc
         self.c = &self.doc.c[offset]
         self.i = offset
-        self.array_len = doc.length
 
     def __len__(self):
         return self.c.lex.length
@@ -410,6 +409,28 @@ cdef class Token:
             iob_strings = ('', 'I', 'O', 'B')
             return iob_strings[self.c.ent_iob]
 
+    property ent_id:
+        '''An (integer) entity ID. Usually assigned by patterns in the Matcher.'''
+        def __get__(self):
+            return self.c.ent_id
+
+        def __set__(self, hash_t key):
+            # TODO
+            raise NotImplementedError(
+                "Can't yet set ent_id from Token. Vote for this feature on the issue "
+                "tracker: http://github.com/spacy-io/spaCy")
+
+    property ent_id_:
+        '''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
+        def __get__(self):
+            return self.vocab.strings[self.c.ent_id]
+
+        def __set__(self, hash_t key):
+            # TODO
+            raise NotImplementedError(
+                "Can't yet set ent_id_ from Token. Vote for this feature on the issue "
+                "tracker: http://github.com/spacy-io/spaCy")
+
     property whitespace_:
         def __get__(self):
             return ' ' if self.c.spacy else ''
@@ -507,3 +528,17 @@ cdef class Token:
     property like_email:
         def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
+
+
+# Scratch notes on the intended ent_id API -- not functional yet: `nlp` is
+# undefined at this point, and Span.attrs / Matcher.get_attrs don't exist.
+#
+# doc = nlp('Google Now is a moribund project destined for closure.')
+#
+# google_now = doc.ents[0]  # Span instance
+#
+# google_now.attrs['category'] == 'TECHNOLOGY'
+#
+# ent_id = google_now.ent_id
+#
+# attrs = nlp.matcher.get_attrs(ent_id)
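Finally, note that the acceptor callback's signature changes along with the
match shape: it now receives the entity ID as well, and can veto a match by
returning something falsy. A sketch of a hypothetical filter (not part of
this patch), reusing the `matcher` and `doc` from the earlier sketch:

    def acceptor(doc, ent_id, label, start, end):
        # Keep only multi-token matches.
        return end - start > 1

    matches = matcher(doc, acceptor=acceptor)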