Merge pull request #1999 from explosion/feature/better-faster-matcher

Improved Matcher engine
2025-09-18 10:02:40 +03:00 · 2018-02-22 21:50:05 +01:00 · 2018-02-22 21:50:05 +01:00 · 6b30dbd736
commit 6b30dbd736
parent 8c09850354 331904fa9c
7 changed files with 623 additions and 220 deletions
--- a/spacy/_matcher2_notes.py
+++ b/spacy/_matcher2_notes.py
@ -0,0 +1,251 @@
 import pytest
 class Vocab(object):
    pass
 class Doc(list):
    def __init__(self, vocab, words=None):
        list.__init__(self)
        self.extend([Token(i, w) for i, w in enumerate(words)])
 class Token(object):
    def __init__(self, i, word):
        self.i = i
        self.text = word
 def find_matches(patterns, doc):
    init_states = [(pattern, 0, None) for pattern in patterns]
    curr_states = []
    matches = []
    for token in doc:
        nexts = []
        for state in (curr_states + init_states):
            matches, nexts = transition(state, token, matches, nexts)
        curr_states = nexts
    return matches
 def transition(state, token, matches, nexts):
    action = get_action(state, token)
    is_match, keep_state, advance_state = [bool(int(c)) for c in action]
    pattern, i, start = state
    if start is None:
        start = token.i
    if is_match:
        matches.append((pattern, start, token.i+1))
    if advance_state:
        nexts.append((pattern, i+1, start))
    if keep_state:
        # TODO: This needs to be zero-width :(.
        nexts.append((pattern, i, start))
    return (matches, nexts)
 def get_action(state, token):
    '''We need to consider:
    a) Does the token match the specification? [Yes, No]
    b) What's the quantifier? [1, 0+, ?]
    c) Is this the last specification? [final, non-final]
    We can transition in the following ways:
    a) Do we emit a match?
    b) Do we add a state with (next state, next token)?
    c) Do we add a state with (next state, same token)?
    d) Do we add a state with (same state, next token)?
    We'll code the actions as boolean strings, so 0000 means no to all 4,
    1000 means match but no states added, etc.
    1:
      Yes, final:
        1000
      Yes, non-final:
        0100
      No, final:
        0000
      No, non-final
        0000
    0+:
      Yes, final:
        1001
      Yes, non-final:
        0111
      No, final:
        1000 (note: Don't include last token!)
      No, non-final:
        0010
    ?:
      Yes, final:
        1000
      Yes, non-final:
        0100
      No, final:
        1000 (note: Don't include last token!)
      No, non-final:
        0010
    Problem: If a quantifier is matching, we're adding a lot of open partials
    '''
    is_match = get_is_match(state, token)
    operator = get_operator(state, token)
    is_final = get_is_final(state, token)
    raise NotImplementedError
 def get_is_match(state, token):
    pattern, i, start = state
    is_match = token.text == pattern[i]['spec']
    if pattern[i].get('invert'):
        return not is_match
    else:
        return is_match
 def get_is_final(state, token):
    pattern, i, start = state
    return i == len(pattern)-1
 def get_operator(state, token):
    pattern, i, start = state
    return pattern[i].get('op', '1')
 ########################
 # Tests for get_action #
 ########################
 def test_get_action_simple_match():
    pattern = [{'spec': 'a', 'op': '1'}]
    doc = Doc(Vocab(), words=['a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '100'
 def test_get_action_simple_reject():
    pattern = [{'spec': 'b', 'op': '1'}]
    doc = Doc(Vocab(), words=['a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '000'
 def test_get_action_simple_match_match():
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'a', 'op': '1'}]
    doc = Doc(Vocab(), words=['a', 'a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '001'
    state = (pattern, 1, 0)
    action = get_action(state, doc[1])
    assert action == '100'
 def test_get_action_simple_match_reject():
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
    doc = Doc(Vocab(), words=['a', 'a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '001'
    state = (pattern, 1, 0)
    action = get_action(state, doc[1])
    assert action == '000'
 def test_get_action_simple_match_reject():
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
    doc = Doc(Vocab(), words=['a', 'a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '001'
    state = (pattern, 1, 0)
    action = get_action(state, doc[1])
    assert action == '000'
 def test_get_action_plus_match():
    pattern = [{'spec': 'a', 'op': '1+'}]
    doc = Doc(Vocab(), words=['a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '110'
 def test_get_action_plus_match_match():
    pattern = [{'spec': 'a', 'op': '1+'}]
    doc = Doc(Vocab(), words=['a', 'a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '110'
    state = (pattern, 0, 0)
    action = get_action(state, doc[1])
    assert action == '110'
 ##########################
 # Tests for find_matches #
 ##########################
 def test_find_matches_simple_accept():
    pattern = [{'spec': 'a', 'op': '1'}]
    doc = Doc(Vocab(), words=['a'])
    matches = find_matches([pattern], doc)
    assert matches == [(pattern, 0, 1)]
 def test_find_matches_simple_reject():
    pattern = [{'spec': 'a', 'op': '1'}]
    doc = Doc(Vocab(), words=['b'])
    matches = find_matches([pattern], doc)
    assert matches == []
 def test_find_matches_match_twice():
    pattern = [{'spec': 'a', 'op': '1'}]
    doc = Doc(Vocab(), words=['a', 'a'])
    matches = find_matches([pattern], doc)
    assert matches == [(pattern, 0, 1), (pattern, 1, 2)]
 def test_find_matches_longer_pattern():
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
    doc = Doc(Vocab(), words=['a', 'b'])
    matches = find_matches([pattern], doc)
    assert matches == [(pattern, 0, 2)]
 def test_find_matches_two_patterns():
    patterns = [[{'spec': 'a', 'op': '1'}], [{'spec': 'b', 'op': '1'}]]
    doc = Doc(Vocab(), words=['a', 'b'])
    matches = find_matches(patterns, doc)
    assert matches == [(patterns[0], 0, 1), (patterns[1], 1, 2)]
 def test_find_matches_two_patterns_overlap():
    patterns = [[{'spec': 'a'}, {'spec': 'b'}],
                [{'spec': 'b'}, {'spec': 'c'}]]
    doc = Doc(Vocab(), words=['a', 'b', 'c'])
    matches = find_matches(patterns, doc)
    assert matches == [(patterns[0], 0, 2), (patterns[1], 1, 3)]
 def test_find_matches_greedy():
    patterns = [[{'spec': 'a', 'op': '1+'}]]
    doc = Doc(Vocab(), words=['a'])
    matches = find_matches(patterns, doc)
    assert matches == [(patterns[0], 0, 1)]
    doc = Doc(Vocab(), words=['a', 'a'])
    matches = find_matches(patterns, doc)
    assert matches == [(patterns[0], 0, 1), (patterns[0], 0, 2), (patterns[0], 1, 2)]
 def test_find_matches_non_greedy():
    patterns = [[{'spec': 'a', 'op': '0+'}, {'spec': 'b', "op": "1"}]]
    doc = Doc(Vocab(), words=['b'])
    matches = find_matches(patterns, doc)
    assert matches == [(patterns[0], 0, 1)]
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@ -1,24 +1,18 @@
 # cython: profile=True
 # cython: infer_types=True
-# coding: utf8
+# cython: profile=True
 from __future__ import unicode_literals
 import ujson
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
-from libcpp.pair cimport pair
+from libc.stdint cimport int32_t, uint64_t, uint16_t
 from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
-from libc.stdint cimport int32_t
+from .typedefs cimport attr_t, hash_t
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from .structs cimport TokenC
-from .tokens.doc cimport Doc, get_token_attr
+from .lexeme cimport attr_id_t
 from .vocab cimport Vocab
-
+from .tokens.doc cimport Doc
 from .tokens.doc cimport get_token_attr
 from .attrs cimport ID, attr_id_t, NULL_ATTR
 from .attrs import IDS
 from .attrs cimport attr_id_t, ID, NULL_ATTR
 from .attrs import FLAG61 as U_ENT
 from .attrs import FLAG60 as B2_ENT
 from .attrs import FLAG59 as B3_ENT
@ -48,29 +42,24 @@ from .attrs import FLAG36 as L9_ENT
 from .attrs import FLAG35 as L10_ENT
-cpdef enum quantifier_t:
+cdef enum action_t:
-    _META
+    REJECT = 0000
-    ONE
+    MATCH = 1000
    ADVANCE = 0100
    RETRY = 0010
    RETRY_EXTEND = 0011
    MATCH_EXTEND = 1001
    MATCH_REJECT = 2000
 cdef enum quantifier_t:
    ZERO
    ZERO_ONE
    ZERO_PLUS
    ONE
    ONE_PLUS
 cdef enum action_t:
    REJECT
    ADVANCE
    REPEAT
    ACCEPT
    ADVANCE_ZERO
    ACCEPT_PREV
    PANIC
 # A "match expression" conists of one or more token patterns
 # Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
 # A state is an (int, pattern pointer) pair, where the int is the start
 # position, and the pattern pointer shows where we're up to
 # in the pattern.
 cdef struct AttrValueC:
    attr_id_t attr
    attr_t value
@ -80,10 +69,231 @@ cdef struct TokenPatternC:
    AttrValueC* attrs
    int32_t nr_attr
    quantifier_t quantifier
    hash_t key
-ctypedef TokenPatternC* TokenPatternC_ptr
+cdef struct ActionC:
-ctypedef pair[int, TokenPatternC_ptr] StateC
+    char emit_match
    char next_state_next_token
    char next_state_same_token
    char same_state_next_token
 cdef struct PatternStateC:
    TokenPatternC* pattern
    int32_t start
    int32_t length
 cdef struct MatchC:
    attr_t pattern_id
    int32_t start
    int32_t length
 cdef find_matches(TokenPatternC** patterns, int n, Doc doc):
    cdef vector[PatternStateC] states
    cdef vector[MatchC] matches
    cdef PatternStateC state
    cdef Pool mem = Pool()
    # TODO: Prefill this with the extra attribute values.
    extra_attrs = <attr_t**>mem.alloc(len(doc), sizeof(attr_t*))
    # Main loop
    cdef int i, j
    for i in range(doc.length):
        for j in range(n):
            states.push_back(PatternStateC(patterns[j], i, 0))
        transition_states(states, matches, &doc.c[i], extra_attrs[i])
    # Handle matches that end in 0-width patterns
    finish_states(matches, states)
    return [(matches[i].pattern_id, matches[i].start, matches[i].start+matches[i].length)
            for i in range(matches.size())]
 cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
        const TokenC* token, const attr_t* extra_attrs) except *:
    cdef int q = 0
    cdef vector[PatternStateC] new_states
    for i in range(states.size()):
        action = get_action(states[i], token, extra_attrs)
        if action == REJECT:
            continue
        state = states[i]
        states[q] = state
        while action in (RETRY, RETRY_EXTEND):
            if action == RETRY_EXTEND:
                new_states.push_back(
                    PatternStateC(pattern=state.pattern, start=state.start,
                                  length=state.length+1))
            states[q].pattern += 1
            action = get_action(states[q], token, extra_attrs)
        if action == REJECT:
            pass
        elif action == ADVANCE:
            states[q].pattern += 1
            states[q].length += 1
            q += 1
        else:
            ent_id = state.pattern[1].attrs.value
            if action == MATCH:
                matches.push_back(
                    MatchC(pattern_id=ent_id, start=state.start,
                            length=state.length+1))
            elif action == MATCH_REJECT:
                matches.push_back(
                    MatchC(pattern_id=ent_id, start=state.start,
                            length=state.length))
            elif action == MATCH_EXTEND:
                matches.push_back(
                    MatchC(pattern_id=ent_id, start=state.start,
                           length=state.length))
                states[q].length += 1
                q += 1
    states.resize(q)
    for i in range(new_states.size()):
        states.push_back(new_states[i])
 cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *:
    '''Handle states that end in zero-width patterns.'''
    cdef PatternStateC state
    for i in range(states.size()):
        state = states[i]
        while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE):
            is_final = get_is_final(state)
            if is_final:
                ent_id = state.pattern[1].attrs.value
                matches.push_back(
                    MatchC(pattern_id=ent_id, start=state.start, length=state.length))
                break
            else:
                state.pattern += 1
 cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
    '''We need to consider:
    a) Does the token match the specification? [Yes, No]
    b) What's the quantifier? [1, 0+, ?]
    c) Is this the last specification? [final, non-final]
    We can transition in the following ways:
    a) Do we emit a match?
    b) Do we add a state with (next state, next token)?
    c) Do we add a state with (next state, same token)?
    d) Do we add a state with (same state, next token)?
    We'll code the actions as boolean strings, so 0000 means no to all 4,
    1000 means match but no states added, etc.
    1:
      Yes, final:
        1000
      Yes, non-final:
        0100
      No, final:
        0000
      No, non-final
        0000
    0+:
      Yes, final:
        1001
      Yes, non-final:
        0011
      No, final:
        1000 (note: Don't include last token!)
      No, non-final:
        0010
    ?:
      Yes, final:
        1000
      Yes, non-final:
        0100
      No, final:
        1000 (note: Don't include last token!)
      No, non-final:
        0010
    Possible combinations:  1000, 0100, 0000, 1001, 0011, 0010, 
    We'll name the bits "match", "advance", "retry", "extend"
    REJECT = 0000
    MATCH = 1000
    ADVANCE = 0100
    RETRY = 0010
    MATCH_EXTEND = 1001
    RETRY_EXTEND = 0011
    MATCH_REJECT = 2000 # Match, but don't include last token
    Problem: If a quantifier is matching, we're adding a lot of open partials
    '''
    cdef char is_match
    is_match = get_is_match(state, token, extra_attrs)
    quantifier = get_quantifier(state)
    is_final = get_is_final(state)
    if quantifier == ZERO:
        is_match = not is_match
        quantifier = ONE
    if quantifier == ONE:
      if is_match and is_final:
          # Yes, final: 1000
          return MATCH
      elif is_match and not is_final:
          # Yes, non-final: 0100
          return ADVANCE
      elif not is_match and is_final:
          # No, final: 0000
          return REJECT
      else:
          return REJECT
    elif quantifier == ZERO_PLUS:
      if is_match and is_final:
          # Yes, final: 1001
          return MATCH_EXTEND
      elif is_match and not is_final:
          # Yes, non-final: 0011
          return RETRY_EXTEND
      elif not is_match and is_final:
          # No, final 2000 (note: Don't include last token!)
          return MATCH_REJECT
      else:
          # No, non-final 0010
          return RETRY
    elif quantifier == ZERO_ONE:
      if is_match and is_final:
          # Yes, final: 1000
          return MATCH
      elif is_match and not is_final:
          # Yes, non-final: 0100
          return ADVANCE
      elif not is_match and is_final:
          # No, final 2000 (note: Don't include last token!)
          return MATCH_REJECT
      else:
          # No, non-final 0010
          return RETRY
 cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
    spec = state.pattern
    for attr in spec.attrs[:spec.nr_attr]:
        if get_token_attr(token, attr.attr) != attr.value:
            return 0
    else:
        return 1
 cdef char get_is_final(PatternStateC state) nogil:
    if state.pattern[1].attrs[0].attr == ID and state.pattern[1].nr_attr == 0:
        return 1
    else:
        return 0
 cdef char get_quantifier(PatternStateC state) nogil:
    return state.pattern.quantifier
 cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
@ -97,6 +307,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
        for j, (attr, value) in enumerate(spec):
            pattern[i].attrs[j].attr = attr
            pattern[i].attrs[j].value = value
        pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
    i = len(token_specs)
    pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
    pattern[i].attrs[0].attr = ID
@ -105,48 +316,16 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
    return pattern
-cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
+cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil:
    while pattern.nr_attr != 0:
        pattern += 1
    id_attr = pattern[0].attrs[0]
    assert id_attr.attr == ID
    return id_attr.value
 cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
    lookahead = &pattern[1]
    for attr in pattern.attrs[:pattern.nr_attr]:
        if get_token_attr(token, attr.attr) != attr.value:
            if pattern.quantifier == ONE:
                return REJECT
            elif pattern.quantifier == ZERO:
                return ACCEPT if lookahead.nr_attr == 0 else ADVANCE
            elif pattern.quantifier in (ZERO_ONE, ZERO_PLUS):
                return ACCEPT_PREV if lookahead.nr_attr == 0 else ADVANCE_ZERO
            else:
                return PANIC
    if pattern.quantifier == ZERO:
        return REJECT
    elif lookahead.nr_attr == 0:
        return ACCEPT
    elif pattern.quantifier in (ONE, ZERO_ONE):
        return ADVANCE
    elif pattern.quantifier == ZERO_PLUS:
        # This is a bandaid over the 'shadowing' problem described here:
        # https://github.com/explosion/spaCy/issues/864
        next_action = get_action(lookahead, token)
        if next_action is REJECT:
            return REPEAT
        else:
            return ADVANCE_ZERO
    else:
        return PANIC
 def _convert_strings(token_specs, string_store):
    # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
-    operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
+    operators = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
-                 '?': (ZERO_ONE,), '1': (ONE,)}
+                 '?': (ZERO_ONE,), '1': (ONE,), '!': (ZERO,)}
    tokens = []
    op = ONE
    for spec in token_specs:
@ -176,21 +355,6 @@ def _convert_strings(token_specs, string_store):
    return tokens
 def merge_phrase(matcher, doc, i, matches):
    """Callback to merge a phrase on match."""
    ent_id, label, start, end = matches[i]
    span = doc[start:end]
    span.merge(ent_type=label, ent_id=ent_id)
 def unpickle_matcher(vocab, patterns, callbacks):
    matcher = Matcher(vocab)
    for key, specs in patterns.items():
        callback = callbacks.get(key, None)
        matcher.add(key, callback, *specs)
    return matcher
 cdef class Matcher:
    """Match sequences of tokens, based on pattern rules."""
    cdef Pool mem
@ -311,7 +475,7 @@ cdef class Matcher:
        if key not in self._patterns:
            return default
        return (self._callbacks[key], self._patterns[key])
-
+    
    def pipe(self, docs, batch_size=1000, n_threads=2):
        """Match a stream of documents, yielding them in turn.
@ -333,85 +497,9 @@ cdef class Matcher:
            describing the matches. A match tuple describes a span
            `doc[start:end]`. The `label_id` and `key` are both integers.
        """
-        cdef vector[StateC] partials
+        matches = find_matches(&self.patterns[0], self.patterns.size(), doc)
-        cdef int n_partials = 0
+        for i, (key, start, end) in enumerate(matches):
-        cdef int q = 0
+            on_match = self._callbacks.get(key, None)
        cdef int i, token_i
        cdef const TokenC* token
        cdef StateC state
        matches = []
        for token_i in range(doc.length):
            token = &doc.c[token_i]
            q = 0
            # Go over the open matches, extending or finalizing if able.
            # Otherwise, we over-write them (q doesn't advance)
            for state in partials:
                action = get_action(state.second, token)
                if action == PANIC:
                    raise Exception("Error selecting action in matcher")
                while action == ADVANCE_ZERO:
                    state.second += 1
                    action = get_action(state.second, token)
                if action == PANIC:
                    raise Exception("Error selecting action in matcher")
                if action == REPEAT:
                    # Leave the state in the queue, and advance to next slot
                    # (i.e. we don't overwrite -- we want to greedily match
                    # more pattern.
                    q += 1
                elif action == REJECT:
                    pass
                elif action == ADVANCE:
                    partials[q] = state
                    partials[q].second += 1
                    q += 1
                elif action in (ACCEPT, ACCEPT_PREV):
                    # TODO: What to do about patterns starting with ZERO? Need
                    # to adjust the start position.
                    start = state.first
                    end = token_i+1 if action == ACCEPT else token_i
                    ent_id = state.second[1].attrs[0].value
                    label = state.second[1].attrs[1].value
                    matches.append((ent_id, start, end))
            partials.resize(q)
            # Check whether we open any new patterns on this token
            for pattern in self.patterns:
                action = get_action(pattern, token)
                if action == PANIC:
                    raise Exception("Error selecting action in matcher")
                while action == ADVANCE_ZERO:
                    pattern += 1
                    action = get_action(pattern, token)
                if action == REPEAT:
                    state.first = token_i
                    state.second = pattern
                    partials.push_back(state)
                elif action == ADVANCE:
                    # TODO: What to do about patterns starting with ZERO? Need
                    # to adjust the start position.
                    state.first = token_i
                    state.second = pattern + 1
                    partials.push_back(state)
                elif action in (ACCEPT, ACCEPT_PREV):
                    start = token_i
                    end = token_i+1 if action == ACCEPT else token_i
                    ent_id = pattern[1].attrs[0].value
                    label = pattern[1].attrs[1].value
                    matches.append((ent_id, start, end))
        # Look for open patterns that are actually satisfied
        for state in partials:
            while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
                state.second += 1
                if state.second.nr_attr == 0:
                    start = state.first
                    end = len(doc)
                    ent_id = state.second.attrs[0].value
                    label = state.second.attrs[0].value
                    matches.append((ent_id, start, end))
        for i, (ent_id, start, end) in enumerate(matches):
            on_match = self._callbacks.get(ent_id)
            if on_match is not None:
                on_match(self, doc, i, matches)
        return matches
@ -423,31 +511,37 @@ cdef class Matcher:
            return key
 def unpickle_matcher(vocab, patterns, callbacks):
    matcher = Matcher(vocab)
    for key, specs in patterns.items():
        callback = callbacks.get(key, None)
        matcher.add(key, callback, *specs)
    return matcher
 def _get_longest_matches(matches):
    '''Filter out matches that have a longer equivalent.'''
    longest_matches = {}
    for pattern_id, start, end in matches:
        key = (pattern_id, start)
        length = end-start
        if key not in longest_matches or length > longest_matches[key]:
            longest_matches[key] = length
    return [(pattern_id, start, start+length)
              for (pattern_id, start), length in longest_matches.items()]
 def get_bilou(length):
-    if length == 1:
+    if length == 0:
        raise ValueError("Length must be >= 1")
    elif length == 1:
        return [U_ENT]
    elif length == 2:
        return [B2_ENT, L2_ENT]
    elif length == 3:
        return [B3_ENT, I3_ENT, L3_ENT]
    elif length == 4:
        return [B4_ENT, I4_ENT, I4_ENT, L4_ENT]
    elif length == 5:
        return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT]
    elif length == 6:
        return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT]
    elif length == 7:
        return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT]
    elif length == 8:
        return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
    elif length == 9:
        return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
                L9_ENT]
    elif length == 10:
        return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
                I10_ENT, I10_ENT, L10_ENT]
    else:
-        raise ValueError("Max length currently 10 for phrase matching")
+        return [B4_ENT, I4_ENT] + [I4_ENT] * (length-3) + [L4_ENT]
 cdef class PhraseMatcher:
@ -456,21 +550,21 @@ cdef class PhraseMatcher:
    cdef Matcher matcher
    cdef PreshMap phrase_ids
    cdef int max_length
    cdef attr_t* _phrase_key
    cdef public object _callbacks
    cdef public object _patterns
    def __init__(self, Vocab vocab, max_length=10):
        self.mem = Pool()
        self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
        self.max_length = max_length
        self.vocab = vocab
        self.matcher = Matcher(self.vocab)
        self.phrase_ids = PreshMap()
-        abstract_patterns = []
+        abstract_patterns = [
-        for length in range(1, max_length):
+            [{U_ENT: True}],
-            abstract_patterns.append([{tag: True}
+            [{B2_ENT: True}, {L2_ENT: True}],
-                                      for tag in get_bilou(length)])
+            [{B3_ENT: True}, {I3_ENT: True}, {L3_ENT: True}],
            [{B4_ENT: True}, {I4_ENT: True}, {I4_ENT: True, "OP": "+"}, {L4_ENT: True}],
        ]
        self.matcher.add('Candidate', None, *abstract_patterns)
        self._callbacks = {}
@ -504,29 +598,24 @@ cdef class PhraseMatcher:
        *docs (Doc): `Doc` objects representing match patterns.
        """
        cdef Doc doc
        for doc in docs:
            if len(doc) >= self.max_length:
                msg = (
                    "Pattern length (%d) >= phrase_matcher.max_length (%d). "
                    "Length can be set on initialization, up to 10."
                )
                raise ValueError(msg % (len(doc), self.max_length))
        cdef hash_t ent_id = self.matcher._normalize_key(key)
        self._callbacks[ent_id] = on_match
        cdef int length
        cdef int i
        cdef hash_t phrase_hash
        cdef Pool mem = Pool()
        for doc in docs:
            length = doc.length
            if length == 0:
                continue
            tags = get_bilou(length)
-            for i in range(self.max_length):
+            phrase_key = <attr_t*>mem.alloc(length, sizeof(attr_t))
                self._phrase_key[i] = 0
            for i, tag in enumerate(tags):
                lexeme = self.vocab[doc.c[i].lex.orth]
                lexeme.set_flag(tag, True)
-                self._phrase_key[i] = lexeme.orth
+                phrase_key[i] = lexeme.orth
-            phrase_hash = hash64(self._phrase_key,
+            phrase_hash = hash64(phrase_key,
-                                 self.max_length * sizeof(attr_t), 0)
+                                 length * sizeof(attr_t), 0)
            self.phrase_ids.set(phrase_hash, <void*>ent_id)
    def __call__(self, Doc doc):
@ -580,14 +669,13 @@ cdef class PhraseMatcher:
                    yield doc
    def accept_match(self, Doc doc, int start, int end):
        assert (end - start) < self.max_length
        cdef int i, j
-        for i in range(self.max_length):
+        cdef Pool mem = Pool()
-            self._phrase_key[i] = 0
+        phrase_key = <attr_t*>mem.alloc(end-start, sizeof(attr_t))
        for i, j in enumerate(range(start, end)):
-            self._phrase_key[i] = doc.c[j].lex.orth
+            phrase_key[i] = doc.c[j].lex.orth
-        cdef hash_t key = hash64(self._phrase_key,
+        cdef hash_t key = hash64(phrase_key,
-                                 self.max_length * sizeof(attr_t), 0)
+                                 (end-start) * sizeof(attr_t), 0)
        ent_id = <hash_t>self.phrase_ids.get(key)
        if ent_id == 0:
            return None
--- a/spacy/tests/regression/test_issue1450.py
+++ b/spacy/tests/regression/test_issue1450.py
@ -13,8 +13,8 @@ from ...vocab import Vocab
        ('a b', 0, 2),
        ('a c', 0, 1),
        ('a b c', 0, 2),
-        ('a b b c', 0, 2),
+        ('a b b c', 0, 3),
-        ('a b b', 0, 2),
+        ('a b b', 0, 3),
    ]
 )
 def test_issue1450_matcher_end_zero_plus(string, start, end):
@ -54,5 +54,6 @@ def test_issue1450_matcher_end_zero_plus(string, start, end):
    if start is None or end is None:
        assert matches == []
-    assert matches[0][1] == start
+    print(matches)
-    assert matches[0][2] == end
+    assert matches[-1][1] == start
    assert matches[-1][2] == end
--- a/spacy/tests/regression/test_issue1855.py
+++ b/spacy/tests/regression/test_issue1855.py
@ -0,0 +1,65 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from ...matcher import Matcher
 import pytest
 pattern1	= [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}]
 pattern2	= [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}]
 pattern3	= [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}]
 pattern4	= [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
 pattern5 	= [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
 re_pattern1	= 'AA*'
 re_pattern2 = 'A*A'
 re_pattern3	= 'AA'
 re_pattern4	= 'BA*B'
 re_pattern5	= 'B*A*B'
@pytest.fixture
 def text():
 	return "(ABBAAAAAB)."
@pytest.fixture
 def doc(en_tokenizer,text):
    doc = en_tokenizer(' '.join(text))
    return doc
@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern',[
 	(pattern1,re_pattern1),
 	(pattern2,re_pattern2),
 	(pattern3,re_pattern3),
 	(pattern4,re_pattern4),
 	(pattern5,re_pattern5)])
 def test_greedy_matching(doc,text,pattern,re_pattern):
 	"""
 	Test that the greedy matching behavior of the * op
 	is consistant with other re implementations
 	"""
 	matcher = Matcher(doc.vocab)
 	matcher.add(re_pattern,None,pattern)
 	matches = matcher(doc)
 	re_matches = [m.span() for m in re.finditer(re_pattern,text)]
 	for match,re_match in zip(matches,re_matches):
 		assert match[1:]==re_match
@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern',[
 	(pattern1,re_pattern1),
 	(pattern2,re_pattern2),
 	(pattern3,re_pattern3),
 	(pattern4,re_pattern4),
 	(pattern5,re_pattern5)])
 def test_match_consuming(doc,text,pattern,re_pattern):
 	"""
 	Test that matcher.__call__ consumes tokens on a match
 	similar to re.findall
 	"""
 	matcher = Matcher(doc.vocab)
 	matcher.add(re_pattern,None,pattern)
 	matches = matcher(doc)
 	re_matches = [m.span() for m in re.finditer(re_pattern,text)]
 	assert len(matches)==len(re_matches)
--- a/spacy/tests/regression/test_issue1945.py
+++ b/spacy/tests/regression/test_issue1945.py
@ -6,7 +6,6 @@ from ...vocab import Vocab
 from ...tokens import Doc
 from ...matcher import Matcher
@pytest.mark.xfail
 def test_issue1945():
    text = "a a a"
    matcher = Matcher(Vocab())
--- a/spacy/tests/regression/test_issue850.py
+++ b/spacy/tests/regression/test_issue850.py
@ -22,10 +22,9 @@ def test_basic_case():
    assert end == 4
@pytest.mark.xfail
 def test_issue850():
-    """The problem here is that the variable-length pattern matches the
+    """The variable-length pattern matches the
-    succeeding token. We then don't handle the ambiguity correctly."""
+    succeeding token. Check we handle the ambiguity correctly."""
    matcher = Matcher(Vocab(
                lex_attr_getters={LOWER: lambda string: string.lower()}))
    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
--- a/spacy/tests/test_matcher.py
+++ b/spacy/tests/test_matcher.py
@ -186,6 +186,7 @@ def test_matcher_match_zero_plus(matcher):
    pattern = [{'ORTH': '"'},
               {'OP': '*', 'IS_PUNCT': False},
               {'ORTH': '"'}]
    matcher = Matcher(matcher.vocab)
    matcher.add('Quote', None, pattern)
    doc = get_doc(matcher.vocab, words)
    assert len(matcher(doc)) == 1
@ -252,9 +253,8 @@ def test_matcher_end_zero_plus(matcher):
    )
    nlp = lambda string: Doc(matcher.vocab, words=string.split())
    assert len(matcher(nlp(u'a'))) == 1
-    assert len(matcher(nlp(u'a b'))) == 1
+    assert len(matcher(nlp(u'a b'))) == 2
    assert len(matcher(nlp(u'a b'))) == 1
    assert len(matcher(nlp(u'a c'))) == 1
-    assert len(matcher(nlp(u'a b c'))) == 1
+    assert len(matcher(nlp(u'a b c'))) == 2
-    assert len(matcher(nlp(u'a b b c'))) == 1
+    assert len(matcher(nlp(u'a b b c'))) == 3
-    assert len(matcher(nlp(u'a b b'))) == 1
+    assert len(matcher(nlp(u'a b b'))) == 3