From fae5c0dc1836257be6e258ae8a0e75096dce3469 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Feb 2018 10:17:43 +0100 Subject: [PATCH 01/23] Work on matcher2 --- spacy/matcher2.pyx | 399 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 399 insertions(+) create mode 100644 spacy/matcher2.pyx diff --git a/spacy/matcher2.pyx b/spacy/matcher2.pyx new file mode 100644 index 000000000..ff90e644d --- /dev/null +++ b/spacy/matcher2.pyx @@ -0,0 +1,399 @@ +# cython: infer_types=True +from libcpp.vector cimport vector +from libc.stdint cimport int32_t, uint64_t +from preshed.maps cimport PreshMap +from cymem.cymem cimport Pool +from murmurhash.mrmr cimport hash64 +from .typedefs cimport attr_t, hash_t +from .structs cimport TokenC +from .lexeme cimport attr_id_t +from .vocab cimport Vocab +from .tokens.doc cimport Doc +from .tokens.doc cimport get_token_attr +from .attrs cimport ID, attr_id_t, NULL_ATTR +from .attrs import IDS + + +cdef enum quantifier_t: + ZERO + ZERO_ONE + ZERO_PLUS + ONE + ONE_PLUS + + +cdef struct AttrValueC: + attr_id_t attr + attr_t value + + +cdef struct TokenPatternC: + AttrValueC* attrs + int32_t nr_attr + quantifier_t quantifier + hash_t key + + +cdef struct ActionC: + char is_match + char keep_state + char advance_state + + +cdef struct PatternStateC: + TokenPatternC* state + int32_t pattern_id + int32_t start + ActionC last_action + + +cdef struct MatchC: + int32_t pattern_id + int32_t start + int32_t end + + +cdef find_matches(TokenPatternC** patterns, int n, Doc doc): + cdef vector[PatternStateC] init_states + cdef ActionC null_action = ActionC(-1, -1, -1) + for i in range(n): + init_states.push_back(PatternStateC(patterns[i], i, -1, last_action=null_action)) + cdef vector[PatternStateC] curr_states + cdef vector[PatternStateC] nexts + cdef vector[MatchC] matches + cdef PreshMap cache = PreshMap() + cdef Pool mem = Pool() + # TODO: Prefill this with the extra attribute values. + extra_attrs = mem.alloc(len(doc), sizeof(attr_t*)) + for i in range(doc.length): + nexts.clear() + for j in range(curr_states.size()): + action = get_action(curr_states[j], &doc.c[i], extra_attrs[i], cache) + transition(matches, nexts, + action, curr_states[j], i) + for j in range(init_states.size()): + action = get_action(init_states[j], &doc.c[i], extra_attrs[i], cache) + transition(matches, nexts, + action, init_states[j], i) + nexts, curr_states = curr_states, nexts + # Filter out matches that have a longer equivalent. 
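The filtering step that follows keeps, for each (pattern_id, start) pair, only the longest match. A minimal pure-Python sketch of that de-duplication, assuming matches arrive as (pattern_id, start, end) triples (the function name here is illustrative, not part of the patch):

    def filter_longest(matches):
        # Keep only the longest span for each (pattern_id, start) pair.
        longest = {}
        for pattern_id, start, end in matches:
            key = (pattern_id, start)
            if key not in longest or end > longest[key]:
                longest[key] = end
        return [(pattern_id, start, end)
                for (pattern_id, start), end in longest.items()]

    # filter_longest([(1, 0, 1), (1, 0, 2), (2, 3, 4)]) == [(1, 0, 2), (2, 3, 4)]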
+ longest_matches = {} + for i in range(matches.size()): + key = matches[i].pattern_id, matches[i].start + length = matches[i].end - matches[i].start + if key not in longest_matches or length > longest_matches[key]: + longest_matches[key] = length + return [(pattern_id, start, length-start) + for (pattern_id, start), length in longest_matches] + + +cdef void transition(vector[MatchC]& matches, vector[PatternStateC]& nexts, + ActionC action, PatternStateC state, int token) except *: + if state.start == -1: + state.start = token + if action.is_match: + matches.push_back( + MatchC(pattern_id=state.pattern_id, start=state.start, end=token+1)) + if action.keep_state: + nexts.push_back(PatternStateC(pattern_id=pattern_id, + start=state.start, state=state.state, last_action=action)) + if action.advance_state: + nexts.push_back(PatternStateC(pattern_id=pattern_id, + start=state.start, state=state.state+1, last_action=action)) + + +cdef ActionC get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs, + PreshMap cache) except *: + '''We need to consider: + + a) Does the token match the specification? [Yes, No] + b) What's the quantifier? [1, 0+, ?] + c) Is this the last specification? [final, non-final] + + We therefore have 12 cases to consider. For each case, we need to know + whether to emit a match, whether to keep the current state in the partials, + and whether to add an advanced state to the partials. + + We therefore have eight possible results for these three booleans, which + we'll code as 000, 001 etc. + + 1: + - Match, final: + 100 + - Match, non-final: + 001 + - No match: + 000 + 0+: + - Match, final: + 100 + - Match, non-final: + 011 + - Non-match, final: + 100 + - Non-match, non-final: + 010 + + Problem: If a quantifier is matching, we're adding a lot of open partials + Question: Is it worth doing a lookahead, to see if we add? 
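    The decision above can also be written out compactly. A pure-Python sketch
    of the (emit_match, keep_state, advance_state) choice for the '1' and '0+'
    quantifiers enumerated in the table (a paraphrase of the table only, not
    the Cython implementation that follows):

        def action_bits(is_match, quantifier, is_final):
            # Returns the three booleans as a bit string: emit, keep, advance.
            if quantifier == '1':
                if not is_match:
                    return '000'
                return '100' if is_final else '001'
            if quantifier == '0+':
                if is_final:
                    return '100'
                return '011' if is_match else '010'
            raise ValueError("unhandled quantifier: %s" % quantifier)

        # action_bits(True, '0+', False) == '011': keep the state and also advance it.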
+ ''' + cached_match = cache.get(state.state.key) + cdef char is_match + if cached_match == 0: + is_match = get_is_match(state, token, extra_attrs) + cached_match = is_match + 1 + cache.set(state.state.key, cached_match) + elif cached_match == 1: + is_match = 0 + else: + is_match = 1 + quantifier = get_quantifier(state, token) + is_final = get_is_final(state, token) + if quantifier == ONE: + if not is_match: + return ActionC(is_match=0, keep_state=0, advance_state=0) + elif is_final: + return ActionC(is_match=1, keep_state=0, advance_state=0) + else: + return ActionC(is_match=0, keep_state=0, advance_state=1) + elif quantifier == ZERO_PLUS: + if is_final: + return ActionC(is_match=1, keep_state=0, advance_state=0) + elif is_match: + return ActionC(is_match=0, keep_state=1, advance_state=1) + else: + return ActionC(is_match=0, keep_state=1, advance_state=0) + elif quantifier == ZERO_ONE: + if is_final: + return ActionC(is_match=1, keep_state=0, advance_state=0) + elif is_match: + if state.last_action.keep_state: + return ActionC(is_match=0, keep_state=0, advance_state=1) + else: + return ActionC(is_match=0, keep_state=1, advance_state=1) + else: + print(quantifier, is_match, is_final) + raise ValueError + + +cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil: + spec = state.state + for attr in spec.attrs[:spec.nr_attr]: + if get_token_attr(token, attr.attr) != attr.value: + return 0 + else: + return 1 + + +cdef char get_is_final(PatternStateC state, const TokenC* token) nogil: + if state.state[1].attrs[0].attr == ID and state.state[1].nr_attr == 0: + return 1 + else: + return 0 + + +cdef char get_quantifier(PatternStateC state, const TokenC* token) nogil: + return state.state.quantifier + + +cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, + object token_specs) except NULL: + pattern = mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC)) + cdef int i + for i, (quantifier, spec) in enumerate(token_specs): + pattern[i].quantifier = quantifier + pattern[i].attrs = mem.alloc(len(spec), sizeof(AttrValueC)) + pattern[i].nr_attr = len(spec) + for j, (attr, value) in enumerate(spec): + pattern[i].attrs[j].attr = attr + pattern[i].attrs[j].value = value + pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0) + i = len(token_specs) + pattern[i].attrs = mem.alloc(2, sizeof(AttrValueC)) + pattern[i].attrs[0].attr = ID + pattern[i].attrs[0].value = entity_id + pattern[i].nr_attr = 0 + return pattern + + +cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil: + while pattern.nr_attr != 0: + pattern += 1 + id_attr = pattern[0].attrs[0] + return id_attr.value + +def _convert_strings(token_specs, string_store): + # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS + operators = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS), + '?': (ZERO_ONE,), '1': (ONE,)} + tokens = [] + op = ONE + for spec in token_specs: + if not spec: + # Signifier for 'any token' + tokens.append((ONE, [(NULL_ATTR, 0)])) + continue + token = [] + ops = (ONE,) + for attr, value in spec.items(): + if isinstance(attr, basestring) and attr.upper() == 'OP': + if value in operators: + ops = operators[value] + else: + msg = "Unknown operator '%s'. 
Options: %s" + raise KeyError(msg % (value, ', '.join(operators.keys()))) + if isinstance(attr, basestring): + attr = IDS.get(attr.upper()) + if isinstance(value, basestring): + value = string_store.add(value) + if isinstance(value, bool): + value = int(value) + if attr is not None: + token.append((attr, value)) + for op in ops: + tokens.append((op, token)) + return tokens + + +cdef class Matcher: + """Match sequences of tokens, based on pattern rules.""" + cdef Pool mem + cdef vector[TokenPatternC*] patterns + cdef readonly Vocab vocab + cdef public object _patterns + cdef public object _entities + cdef public object _callbacks + + def __init__(self, vocab): + """Create the Matcher. + + vocab (Vocab): The vocabulary object, which must be shared with the + documents the matcher will operate on. + RETURNS (Matcher): The newly constructed object. + """ + self._patterns = {} + self._entities = {} + self._callbacks = {} + self.vocab = vocab + self.mem = Pool() + + def __reduce__(self): + data = (self.vocab, self._patterns, self._callbacks) + return (unpickle_matcher, data, None, None) + + def __len__(self): + """Get the number of rules added to the matcher. Note that this only + returns the number of rules (identical with the number of IDs), not the + number of individual patterns. + + RETURNS (int): The number of rules. + """ + return len(self._patterns) + + def __contains__(self, key): + """Check whether the matcher contains rules for a match ID. + + key (unicode): The match ID. + RETURNS (bool): Whether the matcher contains rules for this match ID. + """ + return self._normalize_key(key) in self._patterns + + def add(self, key, on_match, *patterns): + """Add a match-rule to the matcher. A match-rule consists of: an ID + key, an on_match callback, and one or more patterns. + + If the key exists, the patterns are appended to the previous ones, and + the previous on_match callback is replaced. The `on_match` callback + will receive the arguments `(matcher, doc, i, matches)`. You can also + set `on_match` to `None` to not perform any actions. + + A pattern consists of one or more `token_specs`, where a `token_spec` + is a dictionary mapping attribute IDs to values, and optionally a + quantifier operator under the key "op". The available quantifiers are: + + '!': Negate the pattern, by requiring it to match exactly 0 times. + '?': Make the pattern optional, by allowing it to match 0 or 1 times. + '+': Require the pattern to match 1 or more times. + '*': Allow the pattern to zero or more times. + + The + and * operators are usually interpretted "greedily", i.e. longer + matches are returned where possible. However, if you specify two '+' + and '*' patterns in a row and their matches overlap, the first + operator will behave non-greedily. This quirk in the semantics makes + the matcher more efficient, by avoiding the need for back-tracking. + + key (unicode): The match ID. + on_match (callable): Callback executed on match. + *patterns (list): List of token descritions. + """ + for pattern in patterns: + if len(pattern) == 0: + msg = ("Cannot add pattern for zero tokens to matcher.\n" + "key: {key}\n") + raise ValueError(msg.format(key=key)) + key = self._normalize_key(key) + for pattern in patterns: + specs = _convert_strings(pattern, self.vocab.strings) + self.patterns.push_back(init_pattern(self.mem, key, specs)) + self._patterns.setdefault(key, []) + self._callbacks[key] = on_match + self._patterns[key].extend(patterns) + + def remove(self, key): + """Remove a rule from the matcher. 
A KeyError is raised if the key does + not exist. + + key (unicode): The ID of the match rule. + """ + key = self._normalize_key(key) + self._patterns.pop(key) + self._callbacks.pop(key) + cdef int i = 0 + while i < self.patterns.size(): + pattern_key = get_pattern_key(self.patterns.at(i)) + if pattern_key == key: + self.patterns.erase(self.patterns.begin()+i) + else: + i += 1 + + def has_key(self, key): + """Check whether the matcher has a rule with a given key. + + key (string or int): The key to check. + RETURNS (bool): Whether the matcher has the rule. + """ + key = self._normalize_key(key) + return key in self._patterns + + def get(self, key, default=None): + """Retrieve the pattern stored for a key. + + key (unicode or int): The key to retrieve. + RETURNS (tuple): The rule, as an (on_match, patterns) tuple. + """ + key = self._normalize_key(key) + if key not in self._patterns: + return default + return (self._callbacks[key], self._patterns[key]) + + def __call__(self, Doc doc): + """Find all token sequences matching the supplied pattern. + + doc (Doc): The document to match over. + RETURNS (list): A list of `(key, start, end)` tuples, + describing the matches. A match tuple describes a span + `doc[start:end]`. The `label_id` and `key` are both integers. + """ + matches = find_matches(&self.patterns[0], self.patterns.size(), doc) + return matches + + +def unpickle_matcher(vocab, patterns, callbacks): + matcher = Matcher(vocab) + for key, specs in patterns.items(): + callback = callbacks.get(key, None) + matcher.add(key, callback, *specs) + return matcher + + + From 0d3262a9f3c3419770b173df91fc06986c6b0ddd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Feb 2018 10:18:04 +0100 Subject: [PATCH 02/23] Compile matcher2 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 7c26a7491..db20f8ee6 100755 --- a/setup.py +++ b/setup.py @@ -38,6 +38,7 @@ MOD_NAMES = [ 'spacy.tokens.span', 'spacy.tokens.token', 'spacy.matcher', + 'spacy.matcher2', 'spacy.syntax.ner', 'spacy.symbols', 'spacy.vectors', From d34c7326350edc3223ba9327b62d2d764328d11b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Feb 2018 10:19:29 +0100 Subject: [PATCH 03/23] Add Python notes for rethinking matcher --- spacy/_matcher2_notes.py | 251 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100644 spacy/_matcher2_notes.py diff --git a/spacy/_matcher2_notes.py b/spacy/_matcher2_notes.py new file mode 100644 index 000000000..56fd4ca15 --- /dev/null +++ b/spacy/_matcher2_notes.py @@ -0,0 +1,251 @@ +import pytest + + +class Vocab(object): + pass + + +class Doc(list): + def __init__(self, vocab, words=None): + list.__init__(self) + self.extend([Token(i, w) for i, w in enumerate(words)]) + + +class Token(object): + def __init__(self, i, word): + self.i = i + self.text = word + + +def find_matches(patterns, doc): + init_states = [(pattern, 0, None) for pattern in patterns] + curr_states = [] + matches = [] + for token in doc: + nexts = [] + for state in (curr_states + init_states): + matches, nexts = transition(state, token, matches, nexts) + curr_states = nexts + return matches + + +def transition(state, token, matches, nexts): + action = get_action(state, token) + is_match, keep_state, advance_state = [bool(int(c)) for c in action] + pattern, i, start = state + if start is None: + start = token.i + if is_match: + matches.append((pattern, start, token.i+1)) + if keep_state: + nexts.append((pattern, i, start)) + if advance_state: + 
nexts.append((pattern, i+1, start)) + return (matches, nexts) + + +def get_action(state, token): + '''We need to consider: + + a) Does the token match the specification? [Yes, No] + b) What's the quantifier? [1, 1+, 0+] + c) Is this the last specification? [final, non-final] + + We therefore have 12 cases to consider. For each case, we need to know + whether to emit a match, whether to keep the current state in the partials, + and whether to add an advanced state to the partials. + + We therefore have eight possible results for these three booleans, which + we'll code as 000, 001 etc. + + - No match: + 000 + - Match, final: + 1: 100 + 1+: 110 + - Match, non-final: + 1: 001 + 1+: 011 + + Problem: If a quantifier is matching, we're adding a lot of open partials + ''' + is_match = get_is_match(state, token) + operator = get_operator(state, token) + is_final = get_is_final(state, token) + if operator == '1': + if not is_match: + return '000' + elif is_final: + return '100' + else: + return '001' + elif operator == '1+': + if not is_match: + return '000' + if is_final: + return '110' + else: + return '011' + elif operator == '0+': + if is_final: + return '100' + elif is_match: + return '011' + else: + return '010' + else: + print(operator, is_match, is_final) + raise ValueError + + +def get_is_match(state, token): + pattern, i, start = state + is_match = token.text == pattern[i]['spec'] + if pattern[i].get('invert'): + return not is_match + else: + return is_match + +def get_is_final(state, token): + pattern, i, start = state + return i == len(pattern)-1 + +def get_operator(state, token): + pattern, i, start = state + return pattern[i].get('op', '1') + + +######################## +# Tests for get_action # +######################## + + +def test_get_action_simple_match(): + pattern = [{'spec': 'a', 'op': '1'}] + doc = Doc(Vocab(), words=['a']) + state = (pattern, 0, None) + action = get_action(state, doc[0]) + assert action == '100' + + +def test_get_action_simple_reject(): + pattern = [{'spec': 'b', 'op': '1'}] + doc = Doc(Vocab(), words=['a']) + state = (pattern, 0, None) + action = get_action(state, doc[0]) + assert action == '000' + + +def test_get_action_simple_match_match(): + pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'a', 'op': '1'}] + doc = Doc(Vocab(), words=['a', 'a']) + state = (pattern, 0, None) + action = get_action(state, doc[0]) + assert action == '001' + state = (pattern, 1, 0) + action = get_action(state, doc[1]) + assert action == '100' + + +def test_get_action_simple_match_reject(): + pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}] + doc = Doc(Vocab(), words=['a', 'a']) + state = (pattern, 0, None) + action = get_action(state, doc[0]) + assert action == '001' + state = (pattern, 1, 0) + action = get_action(state, doc[1]) + assert action == '000' + + +def test_get_action_simple_match_reject(): + pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}] + doc = Doc(Vocab(), words=['a', 'a']) + state = (pattern, 0, None) + action = get_action(state, doc[0]) + assert action == '001' + state = (pattern, 1, 0) + action = get_action(state, doc[1]) + assert action == '000' + + +def test_get_action_plus_match(): + pattern = [{'spec': 'a', 'op': '1+'}] + doc = Doc(Vocab(), words=['a']) + state = (pattern, 0, None) + action = get_action(state, doc[0]) + assert action == '110' + + +def test_get_action_plus_match_match(): + pattern = [{'spec': 'a', 'op': '1+'}] + doc = Doc(Vocab(), words=['a', 'a']) + state = (pattern, 0, None) + action = get_action(state, doc[0]) + 
assert action == '110' + state = (pattern, 0, 0) + action = get_action(state, doc[1]) + assert action == '110' + + +########################## +# Tests for find_matches # +########################## + +def test_find_matches_simple_accept(): + pattern = [{'spec': 'a', 'op': '1'}] + doc = Doc(Vocab(), words=['a']) + matches = find_matches([pattern], doc) + assert matches == [(pattern, 0, 1)] + + +def test_find_matches_simple_reject(): + pattern = [{'spec': 'a', 'op': '1'}] + doc = Doc(Vocab(), words=['b']) + matches = find_matches([pattern], doc) + assert matches == [] + + +def test_find_matches_match_twice(): + pattern = [{'spec': 'a', 'op': '1'}] + doc = Doc(Vocab(), words=['a', 'a']) + matches = find_matches([pattern], doc) + assert matches == [(pattern, 0, 1), (pattern, 1, 2)] + + +def test_find_matches_longer_pattern(): + pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}] + doc = Doc(Vocab(), words=['a', 'b']) + matches = find_matches([pattern], doc) + assert matches == [(pattern, 0, 2)] + + +def test_find_matches_two_patterns(): + patterns = [[{'spec': 'a', 'op': '1'}], [{'spec': 'b', 'op': '1'}]] + doc = Doc(Vocab(), words=['a', 'b']) + matches = find_matches(patterns, doc) + assert matches == [(patterns[0], 0, 1), (patterns[1], 1, 2)] + + +def test_find_matches_two_patterns_overlap(): + patterns = [[{'spec': 'a'}, {'spec': 'b'}], + [{'spec': 'b'}, {'spec': 'c'}]] + doc = Doc(Vocab(), words=['a', 'b', 'c']) + matches = find_matches(patterns, doc) + assert matches == [(patterns[0], 0, 2), (patterns[1], 1, 3)] + + +def test_find_matches_greedy(): + patterns = [[{'spec': 'a', 'op': '1+'}]] + doc = Doc(Vocab(), words=['a']) + matches = find_matches(patterns, doc) + assert matches == [(patterns[0], 0, 1)] + doc = Doc(Vocab(), words=['a', 'a']) + matches = find_matches(patterns, doc) + assert matches == [(patterns[0], 0, 1), (patterns[0], 0, 2), (patterns[0], 1, 2)] + +def test_find_matches_non_greedy(): + patterns = [[{'spec': 'a', 'op': '0+'}, {'spec': 'b'}]] + doc = Doc(Vocab(), words=['b']) + matches = find_matches(patterns, doc) + assert matches == [(patterns[0], 0, 1)] From b00326a7fe474fd8bbc05f0c1026c0e08437f557 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Feb 2018 12:05:54 +0100 Subject: [PATCH 04/23] Move pattern_id out of TokenPattern --- spacy/matcher2.pyx | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/spacy/matcher2.pyx b/spacy/matcher2.pyx index ff90e644d..3bab60ede 100644 --- a/spacy/matcher2.pyx +++ b/spacy/matcher2.pyx @@ -42,13 +42,12 @@ cdef struct ActionC: cdef struct PatternStateC: TokenPatternC* state - int32_t pattern_id int32_t start ActionC last_action cdef struct MatchC: - int32_t pattern_id + attr_t pattern_id int32_t start int32_t end @@ -57,15 +56,16 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc): cdef vector[PatternStateC] init_states cdef ActionC null_action = ActionC(-1, -1, -1) for i in range(n): - init_states.push_back(PatternStateC(patterns[i], i, -1, last_action=null_action)) + init_states.push_back(PatternStateC(patterns[i], -1, last_action=null_action)) cdef vector[PatternStateC] curr_states cdef vector[PatternStateC] nexts cdef vector[MatchC] matches - cdef PreshMap cache = PreshMap() + cdef PreshMap cache cdef Pool mem = Pool() # TODO: Prefill this with the extra attribute values. 
extra_attrs = mem.alloc(len(doc), sizeof(attr_t*)) for i in range(doc.length): + cache = PreshMap() nexts.clear() for j in range(curr_states.size()): action = get_action(curr_states[j], &doc.c[i], extra_attrs[i], cache) @@ -79,12 +79,13 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc): # Filter out matches that have a longer equivalent. longest_matches = {} for i in range(matches.size()): - key = matches[i].pattern_id, matches[i].start + key = (matches[i].pattern_id, matches[i].start) length = matches[i].end - matches[i].start if key not in longest_matches or length > longest_matches[key]: longest_matches[key] = length - return [(pattern_id, start, length-start) - for (pattern_id, start), length in longest_matches] + print(longest_matches) + return [(pattern_id, start, start+length) + for (pattern_id, start), length in longest_matches.items()] cdef void transition(vector[MatchC]& matches, vector[PatternStateC]& nexts, @@ -92,14 +93,15 @@ cdef void transition(vector[MatchC]& matches, vector[PatternStateC]& nexts, if state.start == -1: state.start = token if action.is_match: + ent_id = state.state[1].attrs.value matches.push_back( - MatchC(pattern_id=state.pattern_id, start=state.start, end=token+1)) + MatchC(pattern_id=ent_id, start=state.start, end=token+1)) if action.keep_state: - nexts.push_back(PatternStateC(pattern_id=pattern_id, - start=state.start, state=state.state, last_action=action)) + nexts.push_back(PatternStateC(start=state.start, state=state.state, + last_action=action)) if action.advance_state: - nexts.push_back(PatternStateC(pattern_id=pattern_id, - start=state.start, state=state.state+1, last_action=action)) + nexts.push_back(PatternStateC(start=state.start, + state=state.state+1, last_action=action)) cdef ActionC get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs, @@ -387,6 +389,12 @@ cdef class Matcher: matches = find_matches(&self.patterns[0], self.patterns.size(), doc) return matches + def _normalize_key(self, key): + if isinstance(key, basestring): + return self.vocab.strings.add(key) + else: + return key + def unpickle_matcher(vocab, patterns, callbacks): matcher = Matcher(vocab) From 9115c3ba0a7f2612f5a1ac550d25cc565fb86814 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Feb 2018 12:06:48 +0100 Subject: [PATCH 05/23] Add TODO in notes --- spacy/_matcher2_notes.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/_matcher2_notes.py b/spacy/_matcher2_notes.py index 56fd4ca15..1cf151ea0 100644 --- a/spacy/_matcher2_notes.py +++ b/spacy/_matcher2_notes.py @@ -37,10 +37,11 @@ def transition(state, token, matches, nexts): start = token.i if is_match: matches.append((pattern, start, token.i+1)) - if keep_state: - nexts.append((pattern, i, start)) if advance_state: nexts.append((pattern, i+1, start)) + if keep_state: + # TODO: This needs to be zero-width :(. 
+ nexts.append((pattern, i, start)) return (matches, nexts) @@ -92,7 +93,7 @@ def get_action(state, token): elif is_match: return '011' else: - return '010' + return '001' else: print(operator, is_match, is_final) raise ValueError @@ -245,7 +246,7 @@ def test_find_matches_greedy(): assert matches == [(patterns[0], 0, 1), (patterns[0], 0, 2), (patterns[0], 1, 2)] def test_find_matches_non_greedy(): - patterns = [[{'spec': 'a', 'op': '0+'}, {'spec': 'b'}]] + patterns = [[{'spec': 'a', 'op': '0+'}, {'spec': 'b', "op": "1"}]] doc = Doc(Vocab(), words=['b']) matches = find_matches(patterns, doc) assert matches == [(patterns[0], 0, 1)] From 1b01685f47fe8e952ae59fa203679813a2ade612 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Feb 2018 12:28:03 +0100 Subject: [PATCH 06/23] Fix ZERO_PLUS operator --- spacy/matcher2.pyx | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/spacy/matcher2.pyx b/spacy/matcher2.pyx index 3bab60ede..37aa5ed61 100644 --- a/spacy/matcher2.pyx +++ b/spacy/matcher2.pyx @@ -68,13 +68,11 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc): cache = PreshMap() nexts.clear() for j in range(curr_states.size()): - action = get_action(curr_states[j], &doc.c[i], extra_attrs[i], cache) transition(matches, nexts, - action, curr_states[j], i) + curr_states[j], i, doc, extra_attrs, cache) for j in range(init_states.size()): - action = get_action(init_states[j], &doc.c[i], extra_attrs[i], cache) transition(matches, nexts, - action, init_states[j], i) + init_states[j], i, doc, extra_attrs, cache) nexts, curr_states = curr_states, nexts # Filter out matches that have a longer equivalent. longest_matches = {} @@ -89,19 +87,26 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc): cdef void transition(vector[MatchC]& matches, vector[PatternStateC]& nexts, - ActionC action, PatternStateC state, int token) except *: + PatternStateC state, int token, + Doc doc, const attr_t* const* extra_attrs, PreshMap cache) except *: + action = get_action(state, &doc.c[token], extra_attrs[token], cache) if state.start == -1: state.start = token if action.is_match: ent_id = state.state[1].attrs.value matches.push_back( MatchC(pattern_id=ent_id, start=state.start, end=token+1)) - if action.keep_state: - nexts.push_back(PatternStateC(start=state.start, state=state.state, - last_action=action)) if action.advance_state: nexts.push_back(PatternStateC(start=state.start, state=state.state+1, last_action=action)) + cdef PatternStateC next_state + if action.keep_state and token < doc.length: + # Keeping the state needs to not consume a token, so we call transition + # with the next state + next_state = PatternStateC(start=state.start, state=state.state+1, + last_action=action) + transition(matches, nexts, next_state, token, doc, extra_attrs, cache) + cdef ActionC get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs, From b4cc39eb74b4390d17a4f0e7f71ad4e476006c09 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 13 Feb 2018 11:45:32 +0100 Subject: [PATCH 07/23] Fix zero-width quantifiers. 
Passes test_matcher --- spacy/matcher2.pyx | 213 ++++++++++++++++++++++++++++----------------- 1 file changed, 135 insertions(+), 78 deletions(-) diff --git a/spacy/matcher2.pyx b/spacy/matcher2.pyx index 37aa5ed61..4545a2f31 100644 --- a/spacy/matcher2.pyx +++ b/spacy/matcher2.pyx @@ -35,28 +35,30 @@ cdef struct TokenPatternC: cdef struct ActionC: - char is_match - char keep_state - char advance_state + char emit_match + char next_state_next_token + char next_state_same_token + char same_state_next_token cdef struct PatternStateC: - TokenPatternC* state + TokenPatternC* pattern int32_t start - ActionC last_action + int32_t length cdef struct MatchC: attr_t pattern_id int32_t start - int32_t end + int32_t length cdef find_matches(TokenPatternC** patterns, int n, Doc doc): + print("N patterns: ", n) cdef vector[PatternStateC] init_states - cdef ActionC null_action = ActionC(-1, -1, -1) + cdef ActionC null_action = ActionC(-1, -1, -1, -1) for i in range(n): - init_states.push_back(PatternStateC(patterns[i], -1, last_action=null_action)) + init_states.push_back(PatternStateC(patterns[i], -1, 0)) cdef vector[PatternStateC] curr_states cdef vector[PatternStateC] nexts cdef vector[MatchC] matches @@ -65,48 +67,65 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc): # TODO: Prefill this with the extra attribute values. extra_attrs = mem.alloc(len(doc), sizeof(attr_t*)) for i in range(doc.length): - cache = PreshMap() nexts.clear() + cache = PreshMap() for j in range(curr_states.size()): transition(matches, nexts, - curr_states[j], i, doc, extra_attrs, cache) + curr_states[j], i, &doc.c[i], extra_attrs[i], cache) for j in range(init_states.size()): transition(matches, nexts, - init_states[j], i, doc, extra_attrs, cache) + init_states[j], i, &doc.c[i], extra_attrs[i], cache) nexts, curr_states = curr_states, nexts + # Handle patterns that end with zero-width + for j in range(curr_states.size()): + state = curr_states[j] + while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE): + is_final = get_is_final(state) + if is_final: + ent_id = state.pattern[1].attrs.value + matches.push_back( + MatchC(pattern_id=ent_id, start=state.start, length=state.length)) + break + else: + state.pattern += 1 # Filter out matches that have a longer equivalent. 
longest_matches = {} for i in range(matches.size()): key = (matches[i].pattern_id, matches[i].start) - length = matches[i].end - matches[i].start + length = matches[i].length if key not in longest_matches or length > longest_matches[key]: longest_matches[key] = length - print(longest_matches) return [(pattern_id, start, start+length) for (pattern_id, start), length in longest_matches.items()] cdef void transition(vector[MatchC]& matches, vector[PatternStateC]& nexts, - PatternStateC state, int token, - Doc doc, const attr_t* const* extra_attrs, PreshMap cache) except *: - action = get_action(state, &doc.c[token], extra_attrs[token], cache) + PatternStateC state, int i, const TokenC* token, const attr_t* extra_attrs, + PreshMap cache) except *: + action = get_action(state, token, extra_attrs, cache) if state.start == -1: - state.start = token - if action.is_match: - ent_id = state.state[1].attrs.value + state.start = i + if action.emit_match == 1: + ent_id = state.pattern[1].attrs.value matches.push_back( - MatchC(pattern_id=ent_id, start=state.start, end=token+1)) - if action.advance_state: + MatchC(pattern_id=ent_id, start=state.start, length=state.length+1)) + elif action.emit_match == 2: + ent_id = state.pattern[1].attrs.value + matches.push_back( + MatchC(pattern_id=ent_id, start=state.start, length=state.length)) + if action.next_state_next_token: nexts.push_back(PatternStateC(start=state.start, - state=state.state+1, last_action=action)) + pattern=&state.pattern[1], length=state.length+1)) + if action.same_state_next_token: + nexts.push_back(PatternStateC(start=state.start, + pattern=state.pattern, length=state.length+1)) cdef PatternStateC next_state - if action.keep_state and token < doc.length: - # Keeping the state needs to not consume a token, so we call transition - # with the next state - next_state = PatternStateC(start=state.start, state=state.state+1, - last_action=action) - transition(matches, nexts, next_state, token, doc, extra_attrs, cache) - + if action.next_state_same_token: + # 0+ and ? non-matches need to not consume a token, so we call transition + # with the same state + next_state = PatternStateC(start=state.start, pattern=&state.pattern[1], + length=state.length) + transition(matches, nexts, next_state, i, token, extra_attrs, cache) cdef ActionC get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs, @@ -117,74 +136,108 @@ cdef ActionC get_action(PatternStateC state, const TokenC* token, const attr_t* b) What's the quantifier? [1, 0+, ?] c) Is this the last specification? [final, non-final] - We therefore have 12 cases to consider. For each case, we need to know - whether to emit a match, whether to keep the current state in the partials, - and whether to add an advanced state to the partials. + We can transition in the following ways: - We therefore have eight possible results for these three booleans, which - we'll code as 000, 001 etc. + a) Do we emit a match? + b) Do we add a state with (next state, next token)? + c) Do we add a state with (next state, same token)? + d) Do we add a state with (same state, next token)? + + We'll code the actions as boolean strings, so 0000 means no to all 4, + 1000 means match but no states added, etc. 
1: - - Match, final: - 100 - - Match, non-final: - 001 - - No match: - 000 + Yes, final: + 1000 + Yes, non-final: + 0100 + No, final: + 0000 + No, non-final + 0000 0+: - - Match, final: - 100 - - Match, non-final: - 011 - - Non-match, final: - 100 - - Non-match, non-final: - 010 + Yes, final: + 1001 + Yes, non-final: + 0011 + No, final: + 1000 (note: Don't include last token!) + No, non-final: + 0010 + ?: + Yes, final: + 1000 + Yes, non-final: + 0100 + No, final: + 1000 (note: Don't include last token!) + No, non-final: + 0010 Problem: If a quantifier is matching, we're adding a lot of open partials - Question: Is it worth doing a lookahead, to see if we add? ''' - cached_match = cache.get(state.state.key) + cached_match = cache.get(state.pattern.key) cdef char is_match if cached_match == 0: is_match = get_is_match(state, token, extra_attrs) cached_match = is_match + 1 - cache.set(state.state.key, cached_match) + cache.set(state.pattern.key, cached_match) elif cached_match == 1: is_match = 0 else: is_match = 1 - quantifier = get_quantifier(state, token) - is_final = get_is_final(state, token) + quantifier = get_quantifier(state) + is_final = get_is_final(state) + if quantifier == ZERO: + is_match = not is_match + quantifier = ONE if quantifier == ONE: - if not is_match: - return ActionC(is_match=0, keep_state=0, advance_state=0) - elif is_final: - return ActionC(is_match=1, keep_state=0, advance_state=0) - else: - return ActionC(is_match=0, keep_state=0, advance_state=1) + if is_match and is_final: + # Yes, final: 1000 + return ActionC(1, 0, 0, 0) + elif is_match and not is_final: + # Yes, non-final: 0100 + return ActionC(0, 1, 0, 0) + elif not is_match and is_final: + # No, final: 0000 + return ActionC(0, 0, 0, 0) + else: + # No, non-final 0000 + return ActionC(0, 0, 0, 0) + elif quantifier == ZERO_PLUS: - if is_final: - return ActionC(is_match=1, keep_state=0, advance_state=0) - elif is_match: - return ActionC(is_match=0, keep_state=1, advance_state=1) - else: - return ActionC(is_match=0, keep_state=1, advance_state=0) + if is_match and is_final: + # Yes, final: 1001 + return ActionC(1, 0, 0, 1) + elif is_match and not is_final: + # Yes, non-final: 0011 + return ActionC(0, 0, 1, 1) + elif not is_match and is_final: + # No, final 1000 (note: Don't include last token!) + return ActionC(2, 0, 0, 0) + else: + # No, non-final 0010 + return ActionC(0, 0, 1, 0) elif quantifier == ZERO_ONE: - if is_final: - return ActionC(is_match=1, keep_state=0, advance_state=0) - elif is_match: - if state.last_action.keep_state: - return ActionC(is_match=0, keep_state=0, advance_state=1) - else: - return ActionC(is_match=0, keep_state=1, advance_state=1) + if is_match and is_final: + # Yes, final: 1000 + return ActionC(1, 0, 0, 0) + elif is_match and not is_final: + # Yes, non-final: 0100 + return ActionC(0, 1, 0, 0) + elif not is_match and is_final: + # No, final 1000 (note: Don't include last token!) 
+ return ActionC(2, 0, 0, 0) + else: + # No, non-final 0010 + return ActionC(0, 0, 1, 0) else: print(quantifier, is_match, is_final) raise ValueError cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil: - spec = state.state + spec = state.pattern for attr in spec.attrs[:spec.nr_attr]: if get_token_attr(token, attr.attr) != attr.value: return 0 @@ -192,15 +245,15 @@ cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* e return 1 -cdef char get_is_final(PatternStateC state, const TokenC* token) nogil: - if state.state[1].attrs[0].attr == ID and state.state[1].nr_attr == 0: +cdef char get_is_final(PatternStateC state) nogil: + if state.pattern[1].attrs[0].attr == ID and state.pattern[1].nr_attr == 0: return 1 else: return 0 -cdef char get_quantifier(PatternStateC state, const TokenC* token) nogil: - return state.state.quantifier +cdef char get_quantifier(PatternStateC state) nogil: + return state.pattern.quantifier cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, @@ -232,7 +285,7 @@ cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil: def _convert_strings(token_specs, string_store): # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS operators = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS), - '?': (ZERO_ONE,), '1': (ONE,)} + '?': (ZERO_ONE,), '1': (ONE,), '!': (ZERO,)} tokens = [] op = ONE for spec in token_specs: @@ -392,6 +445,10 @@ cdef class Matcher: `doc[start:end]`. The `label_id` and `key` are both integers. """ matches = find_matches(&self.patterns[0], self.patterns.size(), doc) + for i, (key, start, end) in enumerate(matches): + on_match = self._callbacks.get(key, None) + if on_match is not None: + on_match(self, doc, i, matches) return matches def _normalize_key(self, key): From 0004331895f625c4660400b7b766d9d2e07fffe0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 13 Feb 2018 11:45:45 +0100 Subject: [PATCH 08/23] Update notes on matcher2 --- spacy/_matcher2_notes.py | 75 ++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/spacy/_matcher2_notes.py b/spacy/_matcher2_notes.py index 1cf151ea0..ece1c9d48 100644 --- a/spacy/_matcher2_notes.py +++ b/spacy/_matcher2_notes.py @@ -49,54 +49,53 @@ def get_action(state, token): '''We need to consider: a) Does the token match the specification? [Yes, No] - b) What's the quantifier? [1, 1+, 0+] + b) What's the quantifier? [1, 0+, ?] c) Is this the last specification? [final, non-final] - We therefore have 12 cases to consider. For each case, we need to know - whether to emit a match, whether to keep the current state in the partials, - and whether to add an advanced state to the partials. + We can transition in the following ways: - We therefore have eight possible results for these three booleans, which - we'll code as 000, 001 etc. + a) Do we emit a match? + b) Do we add a state with (next state, next token)? + c) Do we add a state with (next state, same token)? + d) Do we add a state with (same state, next token)? + + We'll code the actions as boolean strings, so 0000 means no to all 4, + 1000 means match but no states added, etc. - - No match: - 000 - - Match, final: - 1: 100 - 1+: 110 - - Match, non-final: - 1: 001 - 1+: 011 + 1: + Yes, final: + 1000 + Yes, non-final: + 0100 + No, final: + 0000 + No, non-final + 0000 + 0+: + Yes, final: + 1001 + Yes, non-final: + 0111 + No, final: + 1000 (note: Don't include last token!) 
+ No, non-final: + 0010 + ?: + Yes, final: + 1000 + Yes, non-final: + 0100 + No, final: + 1000 (note: Don't include last token!) + No, non-final: + 0010 Problem: If a quantifier is matching, we're adding a lot of open partials ''' is_match = get_is_match(state, token) operator = get_operator(state, token) is_final = get_is_final(state, token) - if operator == '1': - if not is_match: - return '000' - elif is_final: - return '100' - else: - return '001' - elif operator == '1+': - if not is_match: - return '000' - if is_final: - return '110' - else: - return '011' - elif operator == '0+': - if is_final: - return '100' - elif is_match: - return '011' - else: - return '001' - else: - print(operator, is_match, is_final) - raise ValueError + raise NotImplementedError def get_is_match(state, token): From 9efda9e9abec9e0303787671adab007c48cc8629 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 13 Feb 2018 16:27:46 +0100 Subject: [PATCH 09/23] Add PhraseMatcher in matcher2.pyx --- spacy/matcher2.pyx | 195 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 194 insertions(+), 1 deletion(-) diff --git a/spacy/matcher2.pyx b/spacy/matcher2.pyx index 4545a2f31..d3de94911 100644 --- a/spacy/matcher2.pyx +++ b/spacy/matcher2.pyx @@ -12,6 +12,34 @@ from .tokens.doc cimport Doc from .tokens.doc cimport get_token_attr from .attrs cimport ID, attr_id_t, NULL_ATTR from .attrs import IDS +from .attrs import FLAG61 as U_ENT +from .attrs import FLAG60 as B2_ENT +from .attrs import FLAG59 as B3_ENT +from .attrs import FLAG58 as B4_ENT +from .attrs import FLAG57 as B5_ENT +from .attrs import FLAG56 as B6_ENT +from .attrs import FLAG55 as B7_ENT +from .attrs import FLAG54 as B8_ENT +from .attrs import FLAG53 as B9_ENT +from .attrs import FLAG52 as B10_ENT +from .attrs import FLAG51 as I3_ENT +from .attrs import FLAG50 as I4_ENT +from .attrs import FLAG49 as I5_ENT +from .attrs import FLAG48 as I6_ENT +from .attrs import FLAG47 as I7_ENT +from .attrs import FLAG46 as I8_ENT +from .attrs import FLAG45 as I9_ENT +from .attrs import FLAG44 as I10_ENT +from .attrs import FLAG43 as L2_ENT +from .attrs import FLAG42 as L3_ENT +from .attrs import FLAG41 as L4_ENT +from .attrs import FLAG40 as L5_ENT +from .attrs import FLAG39 as L6_ENT +from .attrs import FLAG38 as L7_ENT +from .attrs import FLAG37 as L8_ENT +from .attrs import FLAG36 as L9_ENT +from .attrs import FLAG35 as L10_ENT + cdef enum quantifier_t: @@ -435,6 +463,20 @@ cdef class Matcher: if key not in self._patterns: return default return (self._callbacks[key], self._patterns[key]) + + def pipe(self, docs, batch_size=1000, n_threads=2): + """Match a stream of documents, yielding them in turn. + + docs (iterable): A stream of documents. + batch_size (int): Number of documents to accumulate into a working set. + n_threads (int): The number of threads with which to work on the buffer + in parallel, if the implementation supports multi-threading. + YIELDS (Doc): Documents, in order. + """ + for doc in docs: + self(doc) + yield doc + def __call__(self, Doc doc): """Find all token sequences matching the supplied pattern. 
@@ -466,4 +508,155 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher - +def get_bilou(length): + if length == 1: + return [U_ENT] + elif length == 2: + return [B2_ENT, L2_ENT] + elif length == 3: + return [B3_ENT, I3_ENT, L3_ENT] + elif length == 4: + return [B4_ENT, I4_ENT, I4_ENT, L4_ENT] + elif length == 5: + return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT] + elif length == 6: + return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT] + elif length == 7: + return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT] + elif length == 8: + return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] + elif length == 9: + return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, + L9_ENT] + elif length == 10: + return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, + I10_ENT, I10_ENT, L10_ENT] + else: + raise ValueError("Max length currently 10 for phrase matching") + + +cdef class PhraseMatcher: + cdef Pool mem + cdef Vocab vocab + cdef Matcher matcher + cdef PreshMap phrase_ids + cdef int max_length + cdef attr_t* _phrase_key + cdef public object _callbacks + cdef public object _patterns + + def __init__(self, Vocab vocab, max_length=10): + self.mem = Pool() + self._phrase_key = self.mem.alloc(max_length, sizeof(attr_t)) + self.max_length = max_length + self.vocab = vocab + self.matcher = Matcher(self.vocab) + self.phrase_ids = PreshMap() + abstract_patterns = [] + for length in range(1, max_length): + abstract_patterns.append([{tag: True} + for tag in get_bilou(length)]) + self.matcher.add('Candidate', None, *abstract_patterns) + self._callbacks = {} + + def __len__(self): + """Get the number of rules added to the matcher. Note that this only + returns the number of rules (identical with the number of IDs), not the + number of individual patterns. + + RETURNS (int): The number of rules. + """ + return len(self.phrase_ids) + + def __contains__(self, key): + """Check whether the matcher contains rules for a match ID. + + key (unicode): The match ID. + RETURNS (bool): Whether the matcher contains rules for this match ID. + """ + cdef hash_t ent_id = self.matcher._normalize_key(key) + return ent_id in self._callbacks + + def __reduce__(self): + return (self.__class__, (self.vocab,), None, None) + + def add(self, key, on_match, *docs): + """Add a match-rule to the matcher. A match-rule consists of: an ID + key, an on_match callback, and one or more patterns. + + key (unicode): The match ID. + on_match (callable): Callback executed on match. + *docs (Doc): `Doc` objects representing match patterns. + """ + cdef Doc doc + for doc in docs: + if len(doc) >= self.max_length: + msg = ( + "Pattern length (%d) >= phrase_matcher.max_length (%d). " + "Length can be set on initialization, up to 10." + ) + raise ValueError(msg % (len(doc), self.max_length)) + cdef hash_t ent_id = self.matcher._normalize_key(key) + self._callbacks[ent_id] = on_match + cdef int length + cdef int i + cdef hash_t phrase_hash + for doc in docs: + length = doc.length + tags = get_bilou(length) + for i in range(self.max_length): + self._phrase_key[i] = 0 + for i, tag in enumerate(tags): + lexeme = self.vocab[doc.c[i].lex.orth] + lexeme.set_flag(tag, True) + self._phrase_key[i] = lexeme.orth + phrase_hash = hash64(self._phrase_key, + self.max_length * sizeof(attr_t), 0) + self.phrase_ids.set(phrase_hash, ent_id) + + def __call__(self, Doc doc): + """Find all sequences matching the supplied patterns on the `Doc`. + + doc (Doc): The document to match over. 
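A note on how the PhraseMatcher above piggybacks on the token Matcher: each word of an added phrase gets a positional BILOU flag (B2_ENT and L2_ENT for a two-word phrase, and so on), one abstract pattern per phrase length matches any run of tokens carrying those flags, and accept_match then re-checks the candidate span's word IDs against phrase_ids, so runs of flagged words that do not form a stored phrase are discarded. A toy two-stage version of that idea, with invented names and without the per-length flag distinction:

    def bilou_tags(length):
        # One positional tag per word of the phrase (cf. get_bilou above).
        return ['U'] if length == 1 else ['B'] + ['I'] * (length - 2) + ['L']

    class ToyPhraseMatcher(object):
        def __init__(self):
            self.phrase_ids = {}   # word sequence -> entity id
            self.flags = {}        # word -> positional tags seen in any phrase
        def add(self, ent_id, words):
            self.phrase_ids[tuple(words)] = ent_id
            for word, tag in zip(words, bilou_tags(len(words))):
                self.flags.setdefault(word, set()).add(tag)
        def __call__(self, words):
            matches = []
            for start in range(len(words)):
                for end in range(start + 1, len(words) + 1):
                    span = words[start:end]
                    tags = bilou_tags(len(span))
                    # Stage 1: cheap check on positional flags.
                    if not all(t in self.flags.get(w, ()) for w, t in zip(span, tags)):
                        continue
                    # Stage 2: exact lookup of the word sequence.
                    ent_id = self.phrase_ids.get(tuple(span))
                    if ent_id is not None:
                        matches.append((ent_id, start, end))
            return matches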
+ RETURNS (list): A list of `(key, start, end)` tuples, + describing the matches. A match tuple describes a span + `doc[start:end]`. The `label_id` and `key` are both integers. + """ + matches = [] + for _, start, end in self.matcher(doc): + ent_id = self.accept_match(doc, start, end) + if ent_id is not None: + matches.append((ent_id, start, end)) + for i, (ent_id, start, end) in enumerate(matches): + on_match = self._callbacks.get(ent_id) + if on_match is not None: + on_match(self, doc, i, matches) + return matches + + def pipe(self, stream, batch_size=1000, n_threads=2): + """Match a stream of documents, yielding them in turn. + + docs (iterable): A stream of documents. + batch_size (int): Number of documents to accumulate into a working set. + n_threads (int): The number of threads with which to work on the buffer + in parallel, if the implementation supports multi-threading. + YIELDS (Doc): Documents, in order. + """ + for doc in stream: + self(doc) + yield doc + + def accept_match(self, Doc doc, int start, int end): + assert (end - start) < self.max_length + cdef int i, j + for i in range(self.max_length): + self._phrase_key[i] = 0 + for i, j in enumerate(range(start, end)): + self._phrase_key[i] = doc.c[j].lex.orth + cdef hash_t key = hash64(self._phrase_key, + self.max_length * sizeof(attr_t), 0) + ent_id = self.phrase_ids.get(key) + if ent_id == 0: + return None + else: + return ent_id From 6d7986b0f191f212485226d790cf04e5806674c5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 13 Feb 2018 16:28:06 +0100 Subject: [PATCH 10/23] Fix matcher test --- spacy/tests/test_matcher.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py index 8210467ea..d585a9255 100644 --- a/spacy/tests/test_matcher.py +++ b/spacy/tests/test_matcher.py @@ -1,7 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals -from ..matcher import Matcher, PhraseMatcher +from ..matcher2 import Matcher +from ..matcher2 import PhraseMatcher from .util import get_doc from ..tokens import Doc @@ -186,6 +187,7 @@ def test_matcher_match_zero_plus(matcher): pattern = [{'ORTH': '"'}, {'OP': '*', 'IS_PUNCT': False}, {'ORTH': '"'}] + matcher = Matcher(matcher.vocab) matcher.add('Quote', None, pattern) doc = get_doc(matcher.vocab, words) assert len(matcher(doc)) == 1 From 9bdfa5cd4f8f5e986f4e0fddc1d9c3c8cf80b6b0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 13 Feb 2018 16:28:52 +0100 Subject: [PATCH 11/23] Remove re comparisons tests, as matcher behaves differently --- spacy/tests/test_matcher_greedy.py | 63 ------------------------------ 1 file changed, 63 deletions(-) delete mode 100644 spacy/tests/test_matcher_greedy.py diff --git a/spacy/tests/test_matcher_greedy.py b/spacy/tests/test_matcher_greedy.py deleted file mode 100644 index 882c356ca..000000000 --- a/spacy/tests/test_matcher_greedy.py +++ /dev/null @@ -1,63 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals -import re - -from ..matcher import Matcher - -import pytest - -pattern1 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}] -pattern2 = [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}] -pattern3 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}] -pattern4 = [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}] -pattern5 = [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}] - -re_pattern1 = 'AA*' -re_pattern2 = 'A*A' -re_pattern3 = 'AA' -re_pattern4 = 'BA*B' -re_pattern5 = 'B*A*B' - -@pytest.fixture -def text(): - return 
"(ABBAAAAAB)." - -@pytest.fixture -def doc(en_tokenizer,text): - doc = en_tokenizer(' '.join(text)) - return doc - -@pytest.mark.parametrize('pattern,re_pattern',[ - (pattern1,re_pattern1), - (pattern2,re_pattern2), - (pattern3,re_pattern3), - (pattern4,re_pattern4), - (pattern5,re_pattern5)]) -def test_greedy_matching(doc,text,pattern,re_pattern): - """ - Test that the greedy matching behavior of the * op - is consistant with other re implementations - """ - matcher = Matcher(doc.vocab) - matcher.add(re_pattern,None,pattern) - matches = matcher(doc) - re_matches = [m.span() for m in re.finditer(re_pattern,text)] - for match,re_match in zip(matches,re_matches): - assert match[1:]==re_match - -@pytest.mark.parametrize('pattern,re_pattern',[ - (pattern1,re_pattern1), - (pattern2,re_pattern2), - (pattern3,re_pattern3), - (pattern4,re_pattern4), - (pattern5,re_pattern5)]) -def test_match_consuming(doc,text,pattern,re_pattern): - """ - Test that matcher.__call__ consumes tokens on a match - similar to re.findall - """ - matcher = Matcher(doc.vocab) - matcher.add(re_pattern,None,pattern) - matches = matcher(doc) - re_matches = [m.span() for m in re.finditer(re_pattern,text)] - assert len(matches)==len(re_matches) \ No newline at end of file From dcd8d89aef112d165b94bc65099143d5576b21c8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 13 Feb 2018 16:35:20 +0100 Subject: [PATCH 12/23] Update test for 850, making it work with matcher2 --- spacy/tests/regression/test_issue1945.py | 4 ++-- spacy/tests/regression/test_issue850.py | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/spacy/tests/regression/test_issue1945.py b/spacy/tests/regression/test_issue1945.py index 3b3179f64..59135033a 100644 --- a/spacy/tests/regression/test_issue1945.py +++ b/spacy/tests/regression/test_issue1945.py @@ -4,9 +4,9 @@ import pytest from ...vocab import Vocab from ...tokens import Doc -from ...matcher import Matcher +from ...matcher2 import Matcher -@pytest.mark.xfail +#@pytest.mark.xfail def test_issue1945(): text = "a a a" matcher = Matcher(Vocab()) diff --git a/spacy/tests/regression/test_issue850.py b/spacy/tests/regression/test_issue850.py index 01bc19fb9..e3611c4a6 100644 --- a/spacy/tests/regression/test_issue850.py +++ b/spacy/tests/regression/test_issue850.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import pytest -from ...matcher import Matcher +from ...matcher2 import Matcher from ...vocab import Vocab from ...attrs import LOWER from ...tokens import Doc @@ -22,10 +22,9 @@ def test_basic_case(): assert end == 4 -@pytest.mark.xfail def test_issue850(): - """The problem here is that the variable-length pattern matches the - succeeding token. We then don't handle the ambiguity correctly.""" + """The variable-length pattern matches the + succeeding token. 
Check we handle the ambiguity correctly.""" matcher = Matcher(Vocab( lex_attr_getters={LOWER: lambda string: string.lower()})) IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True) From f43d53f2c5dd88b4729c01ecf8ae78bd5823b295 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 13 Feb 2018 17:15:07 +0100 Subject: [PATCH 13/23] Remove print statement --- spacy/matcher2.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/matcher2.pyx b/spacy/matcher2.pyx index d3de94911..2ec32a5e8 100644 --- a/spacy/matcher2.pyx +++ b/spacy/matcher2.pyx @@ -82,7 +82,6 @@ cdef struct MatchC: cdef find_matches(TokenPatternC** patterns, int n, Doc doc): - print("N patterns: ", n) cdef vector[PatternStateC] init_states cdef ActionC null_action = ActionC(-1, -1, -1, -1) for i in range(n): From 262cbe356e2e60515ab8f52174d3660c24727621 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 13 Feb 2018 17:15:20 +0100 Subject: [PATCH 14/23] Remove caching, as doesn't seem to help for now. --- spacy/matcher2.pyx | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/spacy/matcher2.pyx b/spacy/matcher2.pyx index 2ec32a5e8..98ac92b84 100644 --- a/spacy/matcher2.pyx +++ b/spacy/matcher2.pyx @@ -203,16 +203,16 @@ cdef ActionC get_action(PatternStateC state, const TokenC* token, const attr_t* Problem: If a quantifier is matching, we're adding a lot of open partials ''' - cached_match = cache.get(state.pattern.key) + #cached_match = cache.get(state.pattern.key) cdef char is_match - if cached_match == 0: - is_match = get_is_match(state, token, extra_attrs) - cached_match = is_match + 1 - cache.set(state.pattern.key, cached_match) - elif cached_match == 1: - is_match = 0 - else: - is_match = 1 + #if cached_match == 0: + is_match = get_is_match(state, token, extra_attrs) + # cached_match = is_match + 1 + # cache.set(state.pattern.key, cached_match) + #elif cached_match == 1: + # is_match = 0 + #else: + # is_match = 1 quantifier = get_quantifier(state) is_final = get_is_final(state) if quantifier == ZERO: From 00261eea2752f8e6261f568def2b2d19682a3a31 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 14 Feb 2018 12:10:51 +0100 Subject: [PATCH 15/23] Make tests refer to matcher2 --- spacy/tests/regression/test_issue1450.py | 2 +- spacy/tests/regression/test_issue1855.py | 6 ++++-- spacy/tests/regression/test_issue1883.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/spacy/tests/regression/test_issue1450.py b/spacy/tests/regression/test_issue1450.py index 3c8f975d9..d099763d2 100644 --- a/spacy/tests/regression/test_issue1450.py +++ b/spacy/tests/regression/test_issue1450.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals import pytest -from ...matcher import Matcher +from ...matcher2 import Matcher from ...tokens import Doc from ...vocab import Vocab diff --git a/spacy/tests/regression/test_issue1855.py b/spacy/tests/regression/test_issue1855.py index aeaad9413..e10af0d60 100644 --- a/spacy/tests/regression/test_issue1855.py +++ b/spacy/tests/regression/test_issue1855.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import re -from ...matcher import Matcher +from ...matcher2 import Matcher import pytest @@ -27,6 +27,7 @@ def doc(en_tokenizer,text): doc = en_tokenizer(' '.join(text)) return doc +@pytest.mark.xfail @pytest.mark.parametrize('pattern,re_pattern',[ (pattern1,re_pattern1), (pattern2,re_pattern2), @@ -45,6 +46,7 @@ def test_greedy_matching(doc,text,pattern,re_pattern): for match,re_match in zip(matches,re_matches): assert 
match[1:]==re_match +@pytest.mark.xfail @pytest.mark.parametrize('pattern,re_pattern',[ (pattern1,re_pattern1), (pattern2,re_pattern2), @@ -60,4 +62,4 @@ def test_match_consuming(doc,text,pattern,re_pattern): matcher.add(re_pattern,None,pattern) matches = matcher(doc) re_matches = [m.span() for m in re.finditer(re_pattern,text)] - assert len(matches)==len(re_matches) \ No newline at end of file + assert len(matches)==len(re_matches) diff --git a/spacy/tests/regression/test_issue1883.py b/spacy/tests/regression/test_issue1883.py index 3fcf905c1..1c7393d8d 100644 --- a/spacy/tests/regression/test_issue1883.py +++ b/spacy/tests/regression/test_issue1883.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import copy from ... vocab import Vocab -from ... matcher import Matcher +from ... matcher2 import Matcher from ... tokens import Doc From 7885b92b45c98bc2ab45f9034d4aaa1d3c6da035 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 14 Feb 2018 12:11:17 +0100 Subject: [PATCH 16/23] Refactor matcher2, hopefully making it faster --- spacy/matcher2.pyx | 187 ++++++++++++++++++++++++--------------------- 1 file changed, 102 insertions(+), 85 deletions(-) diff --git a/spacy/matcher2.pyx b/spacy/matcher2.pyx index 98ac92b84..35f6eecf8 100644 --- a/spacy/matcher2.pyx +++ b/spacy/matcher2.pyx @@ -1,6 +1,7 @@ # cython: infer_types=True +# cython: profile=True from libcpp.vector cimport vector -from libc.stdint cimport int32_t, uint64_t +from libc.stdint cimport int32_t, uint64_t, uint16_t from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 @@ -41,6 +42,15 @@ from .attrs import FLAG36 as L9_ENT from .attrs import FLAG35 as L10_ENT +cdef enum action_t: + REJECT = 0000 + MATCH = 1000 + ADVANCE = 0100 + RETRY = 0010 + RETRY_EXTEND = 0011 + MATCH_EXTEND = 1001 + MATCH_REJECT = 2000 + cdef enum quantifier_t: ZERO @@ -82,39 +92,18 @@ cdef struct MatchC: cdef find_matches(TokenPatternC** patterns, int n, Doc doc): - cdef vector[PatternStateC] init_states - cdef ActionC null_action = ActionC(-1, -1, -1, -1) - for i in range(n): - init_states.push_back(PatternStateC(patterns[i], -1, 0)) - cdef vector[PatternStateC] curr_states - cdef vector[PatternStateC] nexts + cdef vector[PatternStateC] states cdef vector[MatchC] matches - cdef PreshMap cache cdef Pool mem = Pool() # TODO: Prefill this with the extra attribute values. extra_attrs = mem.alloc(len(doc), sizeof(attr_t*)) + # Main loop for i in range(doc.length): - nexts.clear() - cache = PreshMap() - for j in range(curr_states.size()): - transition(matches, nexts, - curr_states[j], i, &doc.c[i], extra_attrs[i], cache) - for j in range(init_states.size()): - transition(matches, nexts, - init_states[j], i, &doc.c[i], extra_attrs[i], cache) - nexts, curr_states = curr_states, nexts - # Handle patterns that end with zero-width - for j in range(curr_states.size()): - state = curr_states[j] - while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE): - is_final = get_is_final(state) - if is_final: - ent_id = state.pattern[1].attrs.value - matches.push_back( - MatchC(pattern_id=ent_id, start=state.start, length=state.length)) - break - else: - state.pattern += 1 + for j in range(n): + states.push_back(PatternStateC(patterns[j], i, 0)) + transition_states(states, matches, &doc.c[i], extra_attrs[i]) + # Handle matches that end in 0-width patterns + finish_states(matches, states) # Filter out matches that have a longer equivalent. 
longest_matches = {} for i in range(matches.size()): @@ -126,37 +115,67 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc): for (pattern_id, start), length in longest_matches.items()] -cdef void transition(vector[MatchC]& matches, vector[PatternStateC]& nexts, - PatternStateC state, int i, const TokenC* token, const attr_t* extra_attrs, - PreshMap cache) except *: - action = get_action(state, token, extra_attrs, cache) - if state.start == -1: - state.start = i - if action.emit_match == 1: - ent_id = state.pattern[1].attrs.value - matches.push_back( - MatchC(pattern_id=ent_id, start=state.start, length=state.length+1)) - elif action.emit_match == 2: - ent_id = state.pattern[1].attrs.value - matches.push_back( - MatchC(pattern_id=ent_id, start=state.start, length=state.length)) - if action.next_state_next_token: - nexts.push_back(PatternStateC(start=state.start, - pattern=&state.pattern[1], length=state.length+1)) - if action.same_state_next_token: - nexts.push_back(PatternStateC(start=state.start, - pattern=state.pattern, length=state.length+1)) - cdef PatternStateC next_state - if action.next_state_same_token: - # 0+ and ? non-matches need to not consume a token, so we call transition - # with the same state - next_state = PatternStateC(start=state.start, pattern=&state.pattern[1], - length=state.length) - transition(matches, nexts, next_state, i, token, extra_attrs, cache) +cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, + const TokenC* token, const attr_t* extra_attrs) except *: + cdef int q = 0 + cdef vector[PatternStateC] new_states + for i in range(states.size()): + action = get_action(states[i], token, extra_attrs) + if action == REJECT: + continue + state = states[i] + states[q] = state + while action in (RETRY, RETRY_EXTEND): + if action == RETRY_EXTEND: + new_states.push_back( + PatternStateC(pattern=state.pattern, start=state.start, + length=state.length+1)) + states[q].pattern += 1 + action = get_action(states[q], token, extra_attrs) + if action == REJECT: + pass + elif action == ADVANCE: + states[q].pattern += 1 + states[q].length += 1 + q += 1 + else: + ent_id = state.pattern[1].attrs.value + if action == MATCH: + matches.push_back( + MatchC(pattern_id=ent_id, start=state.start, + length=state.length+1)) + elif action == MATCH_REJECT: + matches.push_back( + MatchC(pattern_id=ent_id, start=state.start, + length=state.length)) + elif action == MATCH_EXTEND: + matches.push_back( + MatchC(pattern_id=ent_id, start=state.start, + length=state.length)) + states[q].length += 1 + q += 1 + states.resize(q) + for i in range(new_states.size()): + states.push_back(new_states[i]) -cdef ActionC get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs, - PreshMap cache) except *: +cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *: + '''Handle states that end in zero-width patterns.''' + cdef PatternStateC state + for i in range(states.size()): + state = states[i] + while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE): + is_final = get_is_final(state) + if is_final: + ent_id = state.pattern[1].attrs.value + matches.push_back( + MatchC(pattern_id=ent_id, start=state.start, length=state.length)) + break + else: + state.pattern += 1 + + +cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) except *: '''We need to consider: a) Does the token match the specification? 
[Yes, No] @@ -201,18 +220,21 @@ cdef ActionC get_action(PatternStateC state, const TokenC* token, const attr_t* No, non-final: 0010 + Possible combinations: 1000, 0100, 0000, 1001, 0011, 0010, + + We'll name the bits "match", "advance", "retry", "extend" + REJECT = 0000 + MATCH = 1000 + ADVANCE = 0100 + RETRY = 0010 + MATCH_EXTEND = 1001 + RETRY_EXTEND = 0011 + MATCH_REJECT = 2000 # Match, but don't include last token + Problem: If a quantifier is matching, we're adding a lot of open partials ''' - #cached_match = cache.get(state.pattern.key) cdef char is_match - #if cached_match == 0: is_match = get_is_match(state, token, extra_attrs) - # cached_match = is_match + 1 - # cache.set(state.pattern.key, cached_match) - #elif cached_match == 1: - # is_match = 0 - #else: - # is_match = 1 quantifier = get_quantifier(state) is_final = get_is_final(state) if quantifier == ZERO: @@ -221,46 +243,41 @@ cdef ActionC get_action(PatternStateC state, const TokenC* token, const attr_t* if quantifier == ONE: if is_match and is_final: # Yes, final: 1000 - return ActionC(1, 0, 0, 0) + return MATCH elif is_match and not is_final: # Yes, non-final: 0100 - return ActionC(0, 1, 0, 0) + return ADVANCE elif not is_match and is_final: # No, final: 0000 - return ActionC(0, 0, 0, 0) + return REJECT else: - # No, non-final 0000 - return ActionC(0, 0, 0, 0) - + return REJECT elif quantifier == ZERO_PLUS: if is_match and is_final: # Yes, final: 1001 - return ActionC(1, 0, 0, 1) + return MATCH_EXTEND elif is_match and not is_final: # Yes, non-final: 0011 - return ActionC(0, 0, 1, 1) + return RETRY_EXTEND elif not is_match and is_final: - # No, final 1000 (note: Don't include last token!) - return ActionC(2, 0, 0, 0) + # No, final 2000 (note: Don't include last token!) + return MATCH_REJECT else: # No, non-final 0010 - return ActionC(0, 0, 1, 0) + return RETRY elif quantifier == ZERO_ONE: if is_match and is_final: # Yes, final: 1000 - return ActionC(1, 0, 0, 0) + return MATCH elif is_match and not is_final: # Yes, non-final: 0100 - return ActionC(0, 1, 0, 0) + return ADVANCE elif not is_match and is_final: - # No, final 1000 (note: Don't include last token!) - return ActionC(2, 0, 0, 0) + # No, final 2000 (note: Don't include last token!) + return MATCH_REJECT else: # No, non-final 0010 - return ActionC(0, 0, 1, 0) - else: - print(quantifier, is_match, is_final) - raise ValueError + return RETRY cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil: From d19dc678868c636bb238800ebbe6de79d4772ea2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 14 Feb 2018 12:16:36 +0100 Subject: [PATCH 17/23] Make get_action nogil, for efficiency --- spacy/matcher2.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/matcher2.pyx b/spacy/matcher2.pyx index 35f6eecf8..5b3675758 100644 --- a/spacy/matcher2.pyx +++ b/spacy/matcher2.pyx @@ -175,7 +175,7 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) state.pattern += 1 -cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) except *: +cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil: '''We need to consider: a) Does the token match the specification? 
[Yes, No] From 9ebf2fe7c3b62826aa219b886211325c68e85c9b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Feb 2018 15:26:15 +0100 Subject: [PATCH 18/23] Make helper function to get longest matches --- spacy/matcher2.pyx | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/spacy/matcher2.pyx b/spacy/matcher2.pyx index 5b3675758..59213bfc1 100644 --- a/spacy/matcher2.pyx +++ b/spacy/matcher2.pyx @@ -94,25 +94,21 @@ cdef struct MatchC: cdef find_matches(TokenPatternC** patterns, int n, Doc doc): cdef vector[PatternStateC] states cdef vector[MatchC] matches + cdef PatternStateC state cdef Pool mem = Pool() # TODO: Prefill this with the extra attribute values. extra_attrs = mem.alloc(len(doc), sizeof(attr_t*)) # Main loop + cdef int i, j for i in range(doc.length): for j in range(n): states.push_back(PatternStateC(patterns[j], i, 0)) transition_states(states, matches, &doc.c[i], extra_attrs[i]) # Handle matches that end in 0-width patterns finish_states(matches, states) - # Filter out matches that have a longer equivalent. - longest_matches = {} - for i in range(matches.size()): - key = (matches[i].pattern_id, matches[i].start) - length = matches[i].length - if key not in longest_matches or length > longest_matches[key]: - longest_matches[key] = length - return [(pattern_id, start, start+length) - for (pattern_id, start), length in longest_matches.items()] + return [(matches[i].pattern_id, matches[i].start, matches[i].start+matches[i].length) + for i in range(matches.size())] + cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, @@ -493,7 +489,6 @@ cdef class Matcher: self(doc) yield doc - def __call__(self, Doc doc): """Find all token sequences matching the supplied pattern. 
@@ -524,6 +519,18 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher +def _get_longest_matches(matches): + '''Filter out matches that have a longer equivalent.''' + longest_matches = {} + for pattern_id, start, end in matches: + key = (pattern_id, start) + length = end-start + if key not in longest_matches or length > longest_matches[key]: + longest_matches[key] = length + return [(pattern_id, start, start+length) + for (pattern_id, start), length in longest_matches.items()] + + def get_bilou(length): if length == 1: return [U_ENT] From 1c1960542611df5b9cc9f9c108fa0c85429ea666 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Feb 2018 15:27:03 +0100 Subject: [PATCH 19/23] Move matcher2.pyx to matcher.pyx --- spacy/matcher.pyx | 604 ++++++++++++++++++--------------------- spacy/matcher2.pyx | 685 --------------------------------------------- 2 files changed, 269 insertions(+), 1020 deletions(-) delete mode 100644 spacy/matcher2.pyx diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 501fc5e5d..59213bfc1 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -1,30 +1,18 @@ -# cython: profile=True # cython: infer_types=True -# coding: utf8 -from __future__ import unicode_literals - -import ujson -from cymem.cymem cimport Pool -from preshed.maps cimport PreshMap +# cython: profile=True from libcpp.vector cimport vector -from libcpp.pair cimport pair -from cython.operator cimport dereference as deref +from libc.stdint cimport int32_t, uint64_t, uint16_t +from preshed.maps cimport PreshMap +from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 -from libc.stdint cimport int32_t - -# try: -# from libcpp.unordered_map cimport unordered_map as umap -# except: -# from libcpp.map cimport map as umap - -from .typedefs cimport attr_t -from .typedefs cimport hash_t +from .typedefs cimport attr_t, hash_t from .structs cimport TokenC -from .tokens.doc cimport Doc, get_token_attr +from .lexeme cimport attr_id_t from .vocab cimport Vocab - +from .tokens.doc cimport Doc +from .tokens.doc cimport get_token_attr +from .attrs cimport ID, attr_id_t, NULL_ATTR from .attrs import IDS -from .attrs cimport attr_id_t, ID, NULL_ATTR from .attrs import FLAG61 as U_ENT from .attrs import FLAG60 as B2_ENT from .attrs import FLAG59 as B3_ENT @@ -54,30 +42,24 @@ from .attrs import FLAG36 as L9_ENT from .attrs import FLAG35 as L10_ENT -cpdef enum quantifier_t: - _META - ONE +cdef enum action_t: + REJECT = 0000 + MATCH = 1000 + ADVANCE = 0100 + RETRY = 0010 + RETRY_EXTEND = 0011 + MATCH_EXTEND = 1001 + MATCH_REJECT = 2000 + + +cdef enum quantifier_t: ZERO ZERO_ONE ZERO_PLUS + ONE + ONE_PLUS -cdef enum action_t: - REJECT - ADVANCE - REPEAT - ACCEPT - ADVANCE_ZERO - ADVANCE_PLUS - ACCEPT_PREV - PANIC - - -# Each token pattern consists of a quantifier and 0+ (attr, value) pairs. -# A state is an (int, pattern pointer) pair, where the int is the start -# position, and the pattern pointer shows where we're up to -# in the pattern. 
- cdef struct AttrValueC: attr_id_t attr attr_t value @@ -87,28 +69,231 @@ cdef struct TokenPatternC: AttrValueC* attrs int32_t nr_attr quantifier_t quantifier + hash_t key -ctypedef TokenPatternC* TokenPatternC_ptr -# ctypedef pair[int, TokenPatternC_ptr] StateC +cdef struct ActionC: + char emit_match + char next_state_next_token + char next_state_same_token + char same_state_next_token -# Match Dictionary entry type -cdef struct MatchEntryC: + +cdef struct PatternStateC: + TokenPatternC* pattern int32_t start - int32_t end - int32_t offset + int32_t length -# A state instance represents the information that defines a -# partial match -# start: the index of the first token in the partial match -# pattern: a pointer to the current token pattern in the full -# pattern -# last_match: The entry of the last span matched by the -# same pattern -cdef struct StateC: + +cdef struct MatchC: + attr_t pattern_id int32_t start - TokenPatternC_ptr pattern - MatchEntryC* last_match + int32_t length + + +cdef find_matches(TokenPatternC** patterns, int n, Doc doc): + cdef vector[PatternStateC] states + cdef vector[MatchC] matches + cdef PatternStateC state + cdef Pool mem = Pool() + # TODO: Prefill this with the extra attribute values. + extra_attrs = mem.alloc(len(doc), sizeof(attr_t*)) + # Main loop + cdef int i, j + for i in range(doc.length): + for j in range(n): + states.push_back(PatternStateC(patterns[j], i, 0)) + transition_states(states, matches, &doc.c[i], extra_attrs[i]) + # Handle matches that end in 0-width patterns + finish_states(matches, states) + return [(matches[i].pattern_id, matches[i].start, matches[i].start+matches[i].length) + for i in range(matches.size())] + + + +cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, + const TokenC* token, const attr_t* extra_attrs) except *: + cdef int q = 0 + cdef vector[PatternStateC] new_states + for i in range(states.size()): + action = get_action(states[i], token, extra_attrs) + if action == REJECT: + continue + state = states[i] + states[q] = state + while action in (RETRY, RETRY_EXTEND): + if action == RETRY_EXTEND: + new_states.push_back( + PatternStateC(pattern=state.pattern, start=state.start, + length=state.length+1)) + states[q].pattern += 1 + action = get_action(states[q], token, extra_attrs) + if action == REJECT: + pass + elif action == ADVANCE: + states[q].pattern += 1 + states[q].length += 1 + q += 1 + else: + ent_id = state.pattern[1].attrs.value + if action == MATCH: + matches.push_back( + MatchC(pattern_id=ent_id, start=state.start, + length=state.length+1)) + elif action == MATCH_REJECT: + matches.push_back( + MatchC(pattern_id=ent_id, start=state.start, + length=state.length)) + elif action == MATCH_EXTEND: + matches.push_back( + MatchC(pattern_id=ent_id, start=state.start, + length=state.length)) + states[q].length += 1 + q += 1 + states.resize(q) + for i in range(new_states.size()): + states.push_back(new_states[i]) + + +cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *: + '''Handle states that end in zero-width patterns.''' + cdef PatternStateC state + for i in range(states.size()): + state = states[i] + while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE): + is_final = get_is_final(state) + if is_final: + ent_id = state.pattern[1].attrs.value + matches.push_back( + MatchC(pattern_id=ent_id, start=state.start, length=state.length)) + break + else: + state.pattern += 1 + + +cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* 
extra_attrs) nogil: + '''We need to consider: + + a) Does the token match the specification? [Yes, No] + b) What's the quantifier? [1, 0+, ?] + c) Is this the last specification? [final, non-final] + + We can transition in the following ways: + + a) Do we emit a match? + b) Do we add a state with (next state, next token)? + c) Do we add a state with (next state, same token)? + d) Do we add a state with (same state, next token)? + + We'll code the actions as boolean strings, so 0000 means no to all 4, + 1000 means match but no states added, etc. + + 1: + Yes, final: + 1000 + Yes, non-final: + 0100 + No, final: + 0000 + No, non-final + 0000 + 0+: + Yes, final: + 1001 + Yes, non-final: + 0011 + No, final: + 1000 (note: Don't include last token!) + No, non-final: + 0010 + ?: + Yes, final: + 1000 + Yes, non-final: + 0100 + No, final: + 1000 (note: Don't include last token!) + No, non-final: + 0010 + + Possible combinations: 1000, 0100, 0000, 1001, 0011, 0010, + + We'll name the bits "match", "advance", "retry", "extend" + REJECT = 0000 + MATCH = 1000 + ADVANCE = 0100 + RETRY = 0010 + MATCH_EXTEND = 1001 + RETRY_EXTEND = 0011 + MATCH_REJECT = 2000 # Match, but don't include last token + + Problem: If a quantifier is matching, we're adding a lot of open partials + ''' + cdef char is_match + is_match = get_is_match(state, token, extra_attrs) + quantifier = get_quantifier(state) + is_final = get_is_final(state) + if quantifier == ZERO: + is_match = not is_match + quantifier = ONE + if quantifier == ONE: + if is_match and is_final: + # Yes, final: 1000 + return MATCH + elif is_match and not is_final: + # Yes, non-final: 0100 + return ADVANCE + elif not is_match and is_final: + # No, final: 0000 + return REJECT + else: + return REJECT + elif quantifier == ZERO_PLUS: + if is_match and is_final: + # Yes, final: 1001 + return MATCH_EXTEND + elif is_match and not is_final: + # Yes, non-final: 0011 + return RETRY_EXTEND + elif not is_match and is_final: + # No, final 2000 (note: Don't include last token!) + return MATCH_REJECT + else: + # No, non-final 0010 + return RETRY + elif quantifier == ZERO_ONE: + if is_match and is_final: + # Yes, final: 1000 + return MATCH + elif is_match and not is_final: + # Yes, non-final: 0100 + return ADVANCE + elif not is_match and is_final: + # No, final 2000 (note: Don't include last token!) 
+ return MATCH_REJECT + else: + # No, non-final 0010 + return RETRY + + +cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil: + spec = state.pattern + for attr in spec.attrs[:spec.nr_attr]: + if get_token_attr(token, attr.attr) != attr.value: + return 0 + else: + return 1 + + +cdef char get_is_final(PatternStateC state) nogil: + if state.pattern[1].attrs[0].attr == ID and state.pattern[1].nr_attr == 0: + return 1 + else: + return 0 + + +cdef char get_quantifier(PatternStateC state) nogil: + return state.pattern.quantifier cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, @@ -122,6 +307,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, for j, (attr, value) in enumerate(spec): pattern[i].attrs[j].attr = attr pattern[i].attrs[j].value = value + pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0) i = len(token_specs) pattern[i].attrs = mem.alloc(2, sizeof(AttrValueC)) pattern[i].attrs[0].attr = ID @@ -130,51 +316,16 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, return pattern -cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0: +cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil: while pattern.nr_attr != 0: pattern += 1 id_attr = pattern[0].attrs[0] - assert id_attr.attr == ID return id_attr.value - -cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil: - lookahead = &pattern[1] - for attr in pattern.attrs[:pattern.nr_attr]: - if get_token_attr(token, attr.attr) != attr.value: - if pattern.quantifier == ONE: - return REJECT - elif pattern.quantifier == ZERO: - return ACCEPT if lookahead.nr_attr == 0 else ADVANCE - elif pattern.quantifier in (ZERO_ONE, ZERO_PLUS): - return ACCEPT_PREV if lookahead.nr_attr == 0 else ADVANCE_ZERO - else: - return PANIC - if pattern.quantifier == ZERO: - return REJECT - elif lookahead.nr_attr == 0: - if pattern.quantifier == ZERO_PLUS: - return REPEAT - else: - return ACCEPT - elif pattern.quantifier in (ONE, ZERO_ONE): - return ADVANCE - elif pattern.quantifier == ZERO_PLUS: - # This is a bandaid over the 'shadowing' problem described here: - # https://github.com/explosion/spaCy/issues/864 - next_action = get_action(lookahead, token) - if next_action is REJECT: - return REPEAT - else: - return ADVANCE_PLUS - else: - return PANIC - - def _convert_strings(token_specs, string_store): # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS - operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS), - '?': (ZERO_ONE,), '1': (ONE,)} + operators = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS), + '?': (ZERO_ONE,), '1': (ONE,), '!': (ZERO,)} tokens = [] op = ONE for spec in token_specs: @@ -204,21 +355,6 @@ def _convert_strings(token_specs, string_store): return tokens -def merge_phrase(matcher, doc, i, matches): - """Callback to merge a phrase on match.""" - ent_id, label, start, end = matches[i] - span = doc[start:end] - span.merge(ent_type=label, ent_id=ent_id) - - -def unpickle_matcher(vocab, patterns, callbacks): - matcher = Matcher(vocab) - for key, specs in patterns.items(): - callback = callbacks.get(key, None) - matcher.add(key, callback, *specs) - return matcher - - cdef class Matcher: """Match sequences of tokens, based on pattern rules.""" cdef Pool mem @@ -339,7 +475,7 @@ cdef class Matcher: if key not in self._patterns: return default return (self._callbacks[key], self._patterns[key]) - + def pipe(self, docs, batch_size=1000, n_threads=2): """Match a stream 
of documents, yielding them in turn. @@ -361,231 +497,9 @@ cdef class Matcher: describing the matches. A match tuple describes a span `doc[start:end]`. The `label_id` and `key` are both integers. """ - cdef vector[StateC] partials - cdef int n_partials = 0 - cdef int q = 0 - cdef int i, token_i - cdef const TokenC* token - cdef StateC state - cdef int j = 0 - cdef int k - cdef bint overlap = False - cdef MatchEntryC* state_match - cdef MatchEntryC* last_matches = self.mem.alloc(self.patterns.size(),sizeof(MatchEntryC)) - - for i in range(self.patterns.size()): - last_matches[i].start = 0 - last_matches[i].end = 0 - last_matches[i].offset = 0 - - matches = [] - for token_i in range(doc.length): - token = &doc.c[token_i] - q = 0 - # Go over the open matches, extending or finalizing if able. - # Otherwise, we over-write them (q doesn't advance) - #for state in partials: - j=0 - while j < n_partials: - state = partials[j] - action = get_action(state.pattern, token) - j += 1 - # Skip patterns that would overlap with an existing match - # Patterns overlap an existing match if they point to the - # same final state and start between the start and end - # of said match. - # Different patterns with the same label are allowed to - # overlap. - state_match = state.last_match - if (state.start > state_match.start - and state.start < state_match.end): - continue - if action == PANIC: - raise Exception("Error selecting action in matcher") - while action == ADVANCE_ZERO: - state.pattern += 1 - action = get_action(state.pattern, token) - if action == PANIC: - raise Exception("Error selecting action in matcher") - - # ADVANCE_PLUS acts like REPEAT, but also pushes a partial that - # acts like and ADVANCE_ZERO - if action == ADVANCE_PLUS: - state.pattern += 1 - partials.push_back(state) - n_partials += 1 - state.pattern -= 1 - action = REPEAT - - if action == ADVANCE: - state.pattern += 1 - - # Check for partial matches that are at the same spec in the same pattern - # Keep the longer of the matches - # This ensures that there are never more then 2 partials for every spec - # in a pattern (one of which gets pruned in this step) - - overlap=False - for i in range(q): - if state.pattern == partials[i].pattern and state.start < partials[i].start: - partials[i] = state - j = i - overlap = True - break - if overlap: - continue - overlap=False - for i in range(q): - if state.pattern == partials[i].pattern: - overlap = True - break - if overlap: - continue - - - if action == REPEAT: - # Leave the state in the queue, and advance to next slot - # (i.e. we don't overwrite -- we want to greedily match - # more pattern. - partials[q] = state - q += 1 - elif action == REJECT: - pass - elif action == ADVANCE: - partials[q] = state - q += 1 - elif action in (ACCEPT, ACCEPT_PREV): - # TODO: What to do about patterns starting with ZERO? Need - # to adjust the start position. - start = state.start - end = token_i+1 if action == ACCEPT else token_i - ent_id = state.pattern[1].attrs[0].value - label = state.pattern[1].attrs[1].value - # Check that this match doesn't overlap with an earlier match. - # Only overwrite an earlier match if it is a substring of this - # match (i.e. it starts after this match starts). 
- state_match = state.last_match - - if start >= state_match.end: - state_match.start = start - state_match.end = end - state_match.offset = len(matches) - matches.append((ent_id,start,end)) - elif start <= state_match.start and end >= state_match.end: - if len(matches) == 0: - assert state_match.offset==0 - state_match.offset = 0 - matches.append((ent_id,start,end)) - else: - i = state_match.offset - matches[i] = (ent_id,start,end) - state_match.start = start - state_match.end = end - else: - pass - - partials.resize(q) - n_partials = q - # Check whether we open any new patterns on this token - i=0 - for pattern in self.patterns: - # Skip patterns that would overlap with an existing match - # state_match = pattern.last_match - state_match = &last_matches[i] - i+=1 - if (token_i > state_match.start - and token_i < state_match.end): - continue - action = get_action(pattern, token) - if action == PANIC: - raise Exception("Error selecting action in matcher") - while action in (ADVANCE_PLUS,ADVANCE_ZERO): - if action == ADVANCE_PLUS: - state.start = token_i - state.pattern = pattern - state.last_match = state_match - partials.push_back(state) - n_partials += 1 - pattern += 1 - action = get_action(pattern, token) - - if action == ADVANCE: - pattern += 1 - j=0 - overlap = False - for j in range(q): - if pattern == partials[j].pattern: - overlap = True - break - if overlap: - continue - - - if action == REPEAT: - state.start = token_i - state.pattern = pattern - state.last_match = state_match - partials.push_back(state) - n_partials += 1 - elif action == ADVANCE: - # TODO: What to do about patterns starting with ZERO? Need - # to adjust the start position. - state.start = token_i - state.pattern = pattern - state.last_match = state_match - partials.push_back(state) - n_partials += 1 - elif action in (ACCEPT, ACCEPT_PREV): - start = token_i - end = token_i+1 if action == ACCEPT else token_i - ent_id = pattern[1].attrs[0].value - - label = pattern[1].attrs[1].value - if start >= state_match.end: - state_match.start = start - state_match.end = end - state_match.offset = len(matches) - matches.append((ent_id,start,end)) - if start <= state_match.start and end >= state_match.end: - if len(matches) == 0: - state_match.offset = 0 - matches.append((ent_id,start,end)) - else: - j = state_match.offset - matches[j] = (ent_id,start,end) - state_match.start = start - state_match.end = end - else: - pass - - # Look for open patterns that are actually satisfied - for state in partials: - while state.pattern.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS): - state.pattern += 1 - if state.pattern.nr_attr == 0: - start = state.start - end = len(doc) - ent_id = state.pattern.attrs[0].value - label = state.pattern.attrs[1].value - state_match = state.last_match - if start >= state_match.end: - state_match.start = start - state_match.end = end - state_match.offset = len(matches) - matches.append((ent_id,start,end)) - if start <= state_match.start and end >= state_match.end: - j = state_match.offset - if len(matches) == 0: - state_match.offset = 0 - matches.append((ent_id,start,end)) - else: - matches[j] = (ent_id,start,end) - state_match.start = start - state_match.end = end - else: - pass - for i, (ent_id, start, end) in enumerate(matches): - on_match = self._callbacks.get(ent_id) + matches = find_matches(&self.patterns[0], self.patterns.size(), doc) + for i, (key, start, end) in enumerate(matches): + on_match = self._callbacks.get(key, None) if on_match is not None: on_match(self, doc, i, matches) return matches @@ 
-597,6 +511,26 @@ cdef class Matcher: return key +def unpickle_matcher(vocab, patterns, callbacks): + matcher = Matcher(vocab) + for key, specs in patterns.items(): + callback = callbacks.get(key, None) + matcher.add(key, callback, *specs) + return matcher + + +def _get_longest_matches(matches): + '''Filter out matches that have a longer equivalent.''' + longest_matches = {} + for pattern_id, start, end in matches: + key = (pattern_id, start) + length = end-start + if key not in longest_matches or length > longest_matches[key]: + longest_matches[key] = length + return [(pattern_id, start, start+length) + for (pattern_id, start), length in longest_matches.items()] + + def get_bilou(length): if length == 1: return [U_ENT] diff --git a/spacy/matcher2.pyx b/spacy/matcher2.pyx deleted file mode 100644 index 59213bfc1..000000000 --- a/spacy/matcher2.pyx +++ /dev/null @@ -1,685 +0,0 @@ -# cython: infer_types=True -# cython: profile=True -from libcpp.vector cimport vector -from libc.stdint cimport int32_t, uint64_t, uint16_t -from preshed.maps cimport PreshMap -from cymem.cymem cimport Pool -from murmurhash.mrmr cimport hash64 -from .typedefs cimport attr_t, hash_t -from .structs cimport TokenC -from .lexeme cimport attr_id_t -from .vocab cimport Vocab -from .tokens.doc cimport Doc -from .tokens.doc cimport get_token_attr -from .attrs cimport ID, attr_id_t, NULL_ATTR -from .attrs import IDS -from .attrs import FLAG61 as U_ENT -from .attrs import FLAG60 as B2_ENT -from .attrs import FLAG59 as B3_ENT -from .attrs import FLAG58 as B4_ENT -from .attrs import FLAG57 as B5_ENT -from .attrs import FLAG56 as B6_ENT -from .attrs import FLAG55 as B7_ENT -from .attrs import FLAG54 as B8_ENT -from .attrs import FLAG53 as B9_ENT -from .attrs import FLAG52 as B10_ENT -from .attrs import FLAG51 as I3_ENT -from .attrs import FLAG50 as I4_ENT -from .attrs import FLAG49 as I5_ENT -from .attrs import FLAG48 as I6_ENT -from .attrs import FLAG47 as I7_ENT -from .attrs import FLAG46 as I8_ENT -from .attrs import FLAG45 as I9_ENT -from .attrs import FLAG44 as I10_ENT -from .attrs import FLAG43 as L2_ENT -from .attrs import FLAG42 as L3_ENT -from .attrs import FLAG41 as L4_ENT -from .attrs import FLAG40 as L5_ENT -from .attrs import FLAG39 as L6_ENT -from .attrs import FLAG38 as L7_ENT -from .attrs import FLAG37 as L8_ENT -from .attrs import FLAG36 as L9_ENT -from .attrs import FLAG35 as L10_ENT - - -cdef enum action_t: - REJECT = 0000 - MATCH = 1000 - ADVANCE = 0100 - RETRY = 0010 - RETRY_EXTEND = 0011 - MATCH_EXTEND = 1001 - MATCH_REJECT = 2000 - - -cdef enum quantifier_t: - ZERO - ZERO_ONE - ZERO_PLUS - ONE - ONE_PLUS - - -cdef struct AttrValueC: - attr_id_t attr - attr_t value - - -cdef struct TokenPatternC: - AttrValueC* attrs - int32_t nr_attr - quantifier_t quantifier - hash_t key - - -cdef struct ActionC: - char emit_match - char next_state_next_token - char next_state_same_token - char same_state_next_token - - -cdef struct PatternStateC: - TokenPatternC* pattern - int32_t start - int32_t length - - -cdef struct MatchC: - attr_t pattern_id - int32_t start - int32_t length - - -cdef find_matches(TokenPatternC** patterns, int n, Doc doc): - cdef vector[PatternStateC] states - cdef vector[MatchC] matches - cdef PatternStateC state - cdef Pool mem = Pool() - # TODO: Prefill this with the extra attribute values. 
- extra_attrs = mem.alloc(len(doc), sizeof(attr_t*)) - # Main loop - cdef int i, j - for i in range(doc.length): - for j in range(n): - states.push_back(PatternStateC(patterns[j], i, 0)) - transition_states(states, matches, &doc.c[i], extra_attrs[i]) - # Handle matches that end in 0-width patterns - finish_states(matches, states) - return [(matches[i].pattern_id, matches[i].start, matches[i].start+matches[i].length) - for i in range(matches.size())] - - - -cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, - const TokenC* token, const attr_t* extra_attrs) except *: - cdef int q = 0 - cdef vector[PatternStateC] new_states - for i in range(states.size()): - action = get_action(states[i], token, extra_attrs) - if action == REJECT: - continue - state = states[i] - states[q] = state - while action in (RETRY, RETRY_EXTEND): - if action == RETRY_EXTEND: - new_states.push_back( - PatternStateC(pattern=state.pattern, start=state.start, - length=state.length+1)) - states[q].pattern += 1 - action = get_action(states[q], token, extra_attrs) - if action == REJECT: - pass - elif action == ADVANCE: - states[q].pattern += 1 - states[q].length += 1 - q += 1 - else: - ent_id = state.pattern[1].attrs.value - if action == MATCH: - matches.push_back( - MatchC(pattern_id=ent_id, start=state.start, - length=state.length+1)) - elif action == MATCH_REJECT: - matches.push_back( - MatchC(pattern_id=ent_id, start=state.start, - length=state.length)) - elif action == MATCH_EXTEND: - matches.push_back( - MatchC(pattern_id=ent_id, start=state.start, - length=state.length)) - states[q].length += 1 - q += 1 - states.resize(q) - for i in range(new_states.size()): - states.push_back(new_states[i]) - - -cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *: - '''Handle states that end in zero-width patterns.''' - cdef PatternStateC state - for i in range(states.size()): - state = states[i] - while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE): - is_final = get_is_final(state) - if is_final: - ent_id = state.pattern[1].attrs.value - matches.push_back( - MatchC(pattern_id=ent_id, start=state.start, length=state.length)) - break - else: - state.pattern += 1 - - -cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil: - '''We need to consider: - - a) Does the token match the specification? [Yes, No] - b) What's the quantifier? [1, 0+, ?] - c) Is this the last specification? [final, non-final] - - We can transition in the following ways: - - a) Do we emit a match? - b) Do we add a state with (next state, next token)? - c) Do we add a state with (next state, same token)? - d) Do we add a state with (same state, next token)? - - We'll code the actions as boolean strings, so 0000 means no to all 4, - 1000 means match but no states added, etc. - - 1: - Yes, final: - 1000 - Yes, non-final: - 0100 - No, final: - 0000 - No, non-final - 0000 - 0+: - Yes, final: - 1001 - Yes, non-final: - 0011 - No, final: - 1000 (note: Don't include last token!) - No, non-final: - 0010 - ?: - Yes, final: - 1000 - Yes, non-final: - 0100 - No, final: - 1000 (note: Don't include last token!) 
- No, non-final: - 0010 - - Possible combinations: 1000, 0100, 0000, 1001, 0011, 0010, - - We'll name the bits "match", "advance", "retry", "extend" - REJECT = 0000 - MATCH = 1000 - ADVANCE = 0100 - RETRY = 0010 - MATCH_EXTEND = 1001 - RETRY_EXTEND = 0011 - MATCH_REJECT = 2000 # Match, but don't include last token - - Problem: If a quantifier is matching, we're adding a lot of open partials - ''' - cdef char is_match - is_match = get_is_match(state, token, extra_attrs) - quantifier = get_quantifier(state) - is_final = get_is_final(state) - if quantifier == ZERO: - is_match = not is_match - quantifier = ONE - if quantifier == ONE: - if is_match and is_final: - # Yes, final: 1000 - return MATCH - elif is_match and not is_final: - # Yes, non-final: 0100 - return ADVANCE - elif not is_match and is_final: - # No, final: 0000 - return REJECT - else: - return REJECT - elif quantifier == ZERO_PLUS: - if is_match and is_final: - # Yes, final: 1001 - return MATCH_EXTEND - elif is_match and not is_final: - # Yes, non-final: 0011 - return RETRY_EXTEND - elif not is_match and is_final: - # No, final 2000 (note: Don't include last token!) - return MATCH_REJECT - else: - # No, non-final 0010 - return RETRY - elif quantifier == ZERO_ONE: - if is_match and is_final: - # Yes, final: 1000 - return MATCH - elif is_match and not is_final: - # Yes, non-final: 0100 - return ADVANCE - elif not is_match and is_final: - # No, final 2000 (note: Don't include last token!) - return MATCH_REJECT - else: - # No, non-final 0010 - return RETRY - - -cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil: - spec = state.pattern - for attr in spec.attrs[:spec.nr_attr]: - if get_token_attr(token, attr.attr) != attr.value: - return 0 - else: - return 1 - - -cdef char get_is_final(PatternStateC state) nogil: - if state.pattern[1].attrs[0].attr == ID and state.pattern[1].nr_attr == 0: - return 1 - else: - return 0 - - -cdef char get_quantifier(PatternStateC state) nogil: - return state.pattern.quantifier - - -cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, - object token_specs) except NULL: - pattern = mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC)) - cdef int i - for i, (quantifier, spec) in enumerate(token_specs): - pattern[i].quantifier = quantifier - pattern[i].attrs = mem.alloc(len(spec), sizeof(AttrValueC)) - pattern[i].nr_attr = len(spec) - for j, (attr, value) in enumerate(spec): - pattern[i].attrs[j].attr = attr - pattern[i].attrs[j].value = value - pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0) - i = len(token_specs) - pattern[i].attrs = mem.alloc(2, sizeof(AttrValueC)) - pattern[i].attrs[0].attr = ID - pattern[i].attrs[0].value = entity_id - pattern[i].nr_attr = 0 - return pattern - - -cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil: - while pattern.nr_attr != 0: - pattern += 1 - id_attr = pattern[0].attrs[0] - return id_attr.value - -def _convert_strings(token_specs, string_store): - # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS - operators = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS), - '?': (ZERO_ONE,), '1': (ONE,), '!': (ZERO,)} - tokens = [] - op = ONE - for spec in token_specs: - if not spec: - # Signifier for 'any token' - tokens.append((ONE, [(NULL_ATTR, 0)])) - continue - token = [] - ops = (ONE,) - for attr, value in spec.items(): - if isinstance(attr, basestring) and attr.upper() == 'OP': - if value in operators: - ops = operators[value] - else: - msg = "Unknown 
operator '%s'. Options: %s" - raise KeyError(msg % (value, ', '.join(operators.keys()))) - if isinstance(attr, basestring): - attr = IDS.get(attr.upper()) - if isinstance(value, basestring): - value = string_store.add(value) - if isinstance(value, bool): - value = int(value) - if attr is not None: - token.append((attr, value)) - for op in ops: - tokens.append((op, token)) - return tokens - - -cdef class Matcher: - """Match sequences of tokens, based on pattern rules.""" - cdef Pool mem - cdef vector[TokenPatternC*] patterns - cdef readonly Vocab vocab - cdef public object _patterns - cdef public object _entities - cdef public object _callbacks - - def __init__(self, vocab): - """Create the Matcher. - - vocab (Vocab): The vocabulary object, which must be shared with the - documents the matcher will operate on. - RETURNS (Matcher): The newly constructed object. - """ - self._patterns = {} - self._entities = {} - self._callbacks = {} - self.vocab = vocab - self.mem = Pool() - - def __reduce__(self): - data = (self.vocab, self._patterns, self._callbacks) - return (unpickle_matcher, data, None, None) - - def __len__(self): - """Get the number of rules added to the matcher. Note that this only - returns the number of rules (identical with the number of IDs), not the - number of individual patterns. - - RETURNS (int): The number of rules. - """ - return len(self._patterns) - - def __contains__(self, key): - """Check whether the matcher contains rules for a match ID. - - key (unicode): The match ID. - RETURNS (bool): Whether the matcher contains rules for this match ID. - """ - return self._normalize_key(key) in self._patterns - - def add(self, key, on_match, *patterns): - """Add a match-rule to the matcher. A match-rule consists of: an ID - key, an on_match callback, and one or more patterns. - - If the key exists, the patterns are appended to the previous ones, and - the previous on_match callback is replaced. The `on_match` callback - will receive the arguments `(matcher, doc, i, matches)`. You can also - set `on_match` to `None` to not perform any actions. - - A pattern consists of one or more `token_specs`, where a `token_spec` - is a dictionary mapping attribute IDs to values, and optionally a - quantifier operator under the key "op". The available quantifiers are: - - '!': Negate the pattern, by requiring it to match exactly 0 times. - '?': Make the pattern optional, by allowing it to match 0 or 1 times. - '+': Require the pattern to match 1 or more times. - '*': Allow the pattern to zero or more times. - - The + and * operators are usually interpretted "greedily", i.e. longer - matches are returned where possible. However, if you specify two '+' - and '*' patterns in a row and their matches overlap, the first - operator will behave non-greedily. This quirk in the semantics makes - the matcher more efficient, by avoiding the need for back-tracking. - - key (unicode): The match ID. - on_match (callable): Callback executed on match. - *patterns (list): List of token descritions. 
- """ - for pattern in patterns: - if len(pattern) == 0: - msg = ("Cannot add pattern for zero tokens to matcher.\n" - "key: {key}\n") - raise ValueError(msg.format(key=key)) - key = self._normalize_key(key) - for pattern in patterns: - specs = _convert_strings(pattern, self.vocab.strings) - self.patterns.push_back(init_pattern(self.mem, key, specs)) - self._patterns.setdefault(key, []) - self._callbacks[key] = on_match - self._patterns[key].extend(patterns) - - def remove(self, key): - """Remove a rule from the matcher. A KeyError is raised if the key does - not exist. - - key (unicode): The ID of the match rule. - """ - key = self._normalize_key(key) - self._patterns.pop(key) - self._callbacks.pop(key) - cdef int i = 0 - while i < self.patterns.size(): - pattern_key = get_pattern_key(self.patterns.at(i)) - if pattern_key == key: - self.patterns.erase(self.patterns.begin()+i) - else: - i += 1 - - def has_key(self, key): - """Check whether the matcher has a rule with a given key. - - key (string or int): The key to check. - RETURNS (bool): Whether the matcher has the rule. - """ - key = self._normalize_key(key) - return key in self._patterns - - def get(self, key, default=None): - """Retrieve the pattern stored for a key. - - key (unicode or int): The key to retrieve. - RETURNS (tuple): The rule, as an (on_match, patterns) tuple. - """ - key = self._normalize_key(key) - if key not in self._patterns: - return default - return (self._callbacks[key], self._patterns[key]) - - def pipe(self, docs, batch_size=1000, n_threads=2): - """Match a stream of documents, yielding them in turn. - - docs (iterable): A stream of documents. - batch_size (int): Number of documents to accumulate into a working set. - n_threads (int): The number of threads with which to work on the buffer - in parallel, if the implementation supports multi-threading. - YIELDS (Doc): Documents, in order. - """ - for doc in docs: - self(doc) - yield doc - - def __call__(self, Doc doc): - """Find all token sequences matching the supplied pattern. - - doc (Doc): The document to match over. - RETURNS (list): A list of `(key, start, end)` tuples, - describing the matches. A match tuple describes a span - `doc[start:end]`. The `label_id` and `key` are both integers. 
- """ - matches = find_matches(&self.patterns[0], self.patterns.size(), doc) - for i, (key, start, end) in enumerate(matches): - on_match = self._callbacks.get(key, None) - if on_match is not None: - on_match(self, doc, i, matches) - return matches - - def _normalize_key(self, key): - if isinstance(key, basestring): - return self.vocab.strings.add(key) - else: - return key - - -def unpickle_matcher(vocab, patterns, callbacks): - matcher = Matcher(vocab) - for key, specs in patterns.items(): - callback = callbacks.get(key, None) - matcher.add(key, callback, *specs) - return matcher - - -def _get_longest_matches(matches): - '''Filter out matches that have a longer equivalent.''' - longest_matches = {} - for pattern_id, start, end in matches: - key = (pattern_id, start) - length = end-start - if key not in longest_matches or length > longest_matches[key]: - longest_matches[key] = length - return [(pattern_id, start, start+length) - for (pattern_id, start), length in longest_matches.items()] - - -def get_bilou(length): - if length == 1: - return [U_ENT] - elif length == 2: - return [B2_ENT, L2_ENT] - elif length == 3: - return [B3_ENT, I3_ENT, L3_ENT] - elif length == 4: - return [B4_ENT, I4_ENT, I4_ENT, L4_ENT] - elif length == 5: - return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT] - elif length == 6: - return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT] - elif length == 7: - return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT] - elif length == 8: - return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] - elif length == 9: - return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, - L9_ENT] - elif length == 10: - return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, - I10_ENT, I10_ENT, L10_ENT] - else: - raise ValueError("Max length currently 10 for phrase matching") - - -cdef class PhraseMatcher: - cdef Pool mem - cdef Vocab vocab - cdef Matcher matcher - cdef PreshMap phrase_ids - cdef int max_length - cdef attr_t* _phrase_key - cdef public object _callbacks - cdef public object _patterns - - def __init__(self, Vocab vocab, max_length=10): - self.mem = Pool() - self._phrase_key = self.mem.alloc(max_length, sizeof(attr_t)) - self.max_length = max_length - self.vocab = vocab - self.matcher = Matcher(self.vocab) - self.phrase_ids = PreshMap() - abstract_patterns = [] - for length in range(1, max_length): - abstract_patterns.append([{tag: True} - for tag in get_bilou(length)]) - self.matcher.add('Candidate', None, *abstract_patterns) - self._callbacks = {} - - def __len__(self): - """Get the number of rules added to the matcher. Note that this only - returns the number of rules (identical with the number of IDs), not the - number of individual patterns. - - RETURNS (int): The number of rules. - """ - return len(self.phrase_ids) - - def __contains__(self, key): - """Check whether the matcher contains rules for a match ID. - - key (unicode): The match ID. - RETURNS (bool): Whether the matcher contains rules for this match ID. - """ - cdef hash_t ent_id = self.matcher._normalize_key(key) - return ent_id in self._callbacks - - def __reduce__(self): - return (self.__class__, (self.vocab,), None, None) - - def add(self, key, on_match, *docs): - """Add a match-rule to the matcher. A match-rule consists of: an ID - key, an on_match callback, and one or more patterns. - - key (unicode): The match ID. - on_match (callable): Callback executed on match. - *docs (Doc): `Doc` objects representing match patterns. 
- """ - cdef Doc doc - for doc in docs: - if len(doc) >= self.max_length: - msg = ( - "Pattern length (%d) >= phrase_matcher.max_length (%d). " - "Length can be set on initialization, up to 10." - ) - raise ValueError(msg % (len(doc), self.max_length)) - cdef hash_t ent_id = self.matcher._normalize_key(key) - self._callbacks[ent_id] = on_match - cdef int length - cdef int i - cdef hash_t phrase_hash - for doc in docs: - length = doc.length - tags = get_bilou(length) - for i in range(self.max_length): - self._phrase_key[i] = 0 - for i, tag in enumerate(tags): - lexeme = self.vocab[doc.c[i].lex.orth] - lexeme.set_flag(tag, True) - self._phrase_key[i] = lexeme.orth - phrase_hash = hash64(self._phrase_key, - self.max_length * sizeof(attr_t), 0) - self.phrase_ids.set(phrase_hash, ent_id) - - def __call__(self, Doc doc): - """Find all sequences matching the supplied patterns on the `Doc`. - - doc (Doc): The document to match over. - RETURNS (list): A list of `(key, start, end)` tuples, - describing the matches. A match tuple describes a span - `doc[start:end]`. The `label_id` and `key` are both integers. - """ - matches = [] - for _, start, end in self.matcher(doc): - ent_id = self.accept_match(doc, start, end) - if ent_id is not None: - matches.append((ent_id, start, end)) - for i, (ent_id, start, end) in enumerate(matches): - on_match = self._callbacks.get(ent_id) - if on_match is not None: - on_match(self, doc, i, matches) - return matches - - def pipe(self, stream, batch_size=1000, n_threads=2): - """Match a stream of documents, yielding them in turn. - - docs (iterable): A stream of documents. - batch_size (int): Number of documents to accumulate into a working set. - n_threads (int): The number of threads with which to work on the buffer - in parallel, if the implementation supports multi-threading. - YIELDS (Doc): Documents, in order. 
- """ - for doc in stream: - self(doc) - yield doc - - def accept_match(self, Doc doc, int start, int end): - assert (end - start) < self.max_length - cdef int i, j - for i in range(self.max_length): - self._phrase_key[i] = 0 - for i, j in enumerate(range(start, end)): - self._phrase_key[i] = doc.c[j].lex.orth - cdef hash_t key = hash64(self._phrase_key, - self.max_length * sizeof(attr_t), 0) - ent_id = self.phrase_ids.get(key) - if ent_id == 0: - return None - else: - return ent_id From 4533c7408d3b15b133773dd3ccef742f3d293432 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Feb 2018 15:39:47 +0100 Subject: [PATCH 20/23] Update matcher tests --- spacy/tests/regression/test_issue1450.py | 6 +++--- spacy/tests/regression/test_issue1855.py | 2 +- spacy/tests/regression/test_issue1883.py | 2 +- spacy/tests/regression/test_issue1945.py | 3 +-- spacy/tests/regression/test_issue850.py | 2 +- spacy/tests/test_matcher.py | 12 +++++------- 6 files changed, 12 insertions(+), 15 deletions(-) diff --git a/spacy/tests/regression/test_issue1450.py b/spacy/tests/regression/test_issue1450.py index d099763d2..1609f71f5 100644 --- a/spacy/tests/regression/test_issue1450.py +++ b/spacy/tests/regression/test_issue1450.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals import pytest -from ...matcher2 import Matcher +from ...matcher import Matcher from ...tokens import Doc from ...vocab import Vocab @@ -54,5 +54,5 @@ def test_issue1450_matcher_end_zero_plus(string, start, end): if start is None or end is None: assert matches == [] - assert matches[0][1] == start - assert matches[0][2] == end + assert matches[-1][1] == start + assert matches[-1][2] == end diff --git a/spacy/tests/regression/test_issue1855.py b/spacy/tests/regression/test_issue1855.py index e10af0d60..b12b5c251 100644 --- a/spacy/tests/regression/test_issue1855.py +++ b/spacy/tests/regression/test_issue1855.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import re -from ...matcher2 import Matcher +from ...matcher import Matcher import pytest diff --git a/spacy/tests/regression/test_issue1883.py b/spacy/tests/regression/test_issue1883.py index 1c7393d8d..3fcf905c1 100644 --- a/spacy/tests/regression/test_issue1883.py +++ b/spacy/tests/regression/test_issue1883.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import copy from ... vocab import Vocab -from ... matcher2 import Matcher +from ... matcher import Matcher from ... 
tokens import Doc diff --git a/spacy/tests/regression/test_issue1945.py b/spacy/tests/regression/test_issue1945.py index 59135033a..052f699fb 100644 --- a/spacy/tests/regression/test_issue1945.py +++ b/spacy/tests/regression/test_issue1945.py @@ -4,9 +4,8 @@ import pytest from ...vocab import Vocab from ...tokens import Doc -from ...matcher2 import Matcher +from ...matcher import Matcher -#@pytest.mark.xfail def test_issue1945(): text = "a a a" matcher = Matcher(Vocab()) diff --git a/spacy/tests/regression/test_issue850.py b/spacy/tests/regression/test_issue850.py index e3611c4a6..e83b4d8af 100644 --- a/spacy/tests/regression/test_issue850.py +++ b/spacy/tests/regression/test_issue850.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import pytest -from ...matcher2 import Matcher +from ...matcher import Matcher from ...vocab import Vocab from ...attrs import LOWER from ...tokens import Doc diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py index d585a9255..521121861 100644 --- a/spacy/tests/test_matcher.py +++ b/spacy/tests/test_matcher.py @@ -1,8 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from ..matcher2 import Matcher -from ..matcher2 import PhraseMatcher +from ..matcher import Matcher, PhraseMatcher from .util import get_doc from ..tokens import Doc @@ -254,9 +253,8 @@ def test_matcher_end_zero_plus(matcher): ) nlp = lambda string: Doc(matcher.vocab, words=string.split()) assert len(matcher(nlp(u'a'))) == 1 - assert len(matcher(nlp(u'a b'))) == 1 - assert len(matcher(nlp(u'a b'))) == 1 + assert len(matcher(nlp(u'a b'))) == 2 assert len(matcher(nlp(u'a c'))) == 1 - assert len(matcher(nlp(u'a b c'))) == 1 - assert len(matcher(nlp(u'a b b c'))) == 1 - assert len(matcher(nlp(u'a b b'))) == 1 + assert len(matcher(nlp(u'a b c'))) == 2 + assert len(matcher(nlp(u'a b b c'))) == 3 + assert len(matcher(nlp(u'a b b'))) == 3 From afbd46adfb4e9532cfb58d3c86cd95e684ca8269 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Feb 2018 16:10:54 +0100 Subject: [PATCH 21/23] Remove length cap in PhraseMatcher --- spacy/matcher.pyx | 64 ++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 42 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 59213bfc1..b9d7ea5f4 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -532,30 +532,16 @@ def _get_longest_matches(matches): def get_bilou(length): - if length == 1: + if length == 0: + raise ValueError("Length must be >= 1") + elif length == 1: return [U_ENT] elif length == 2: return [B2_ENT, L2_ENT] elif length == 3: return [B3_ENT, I3_ENT, L3_ENT] - elif length == 4: - return [B4_ENT, I4_ENT, I4_ENT, L4_ENT] - elif length == 5: - return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT] - elif length == 6: - return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT] - elif length == 7: - return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT] - elif length == 8: - return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] - elif length == 9: - return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, - L9_ENT] - elif length == 10: - return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, - I10_ENT, I10_ENT, L10_ENT] else: - raise ValueError("Max length currently 10 for phrase matching") + return [B4_ENT, I4_ENT] + [I4_ENT] * (length-3) + [L4_ENT] cdef class PhraseMatcher: @@ -564,21 +550,21 @@ cdef class PhraseMatcher: cdef Matcher matcher cdef PreshMap phrase_ids cdef int max_length - cdef attr_t* _phrase_key cdef 
public object _callbacks cdef public object _patterns def __init__(self, Vocab vocab, max_length=10): self.mem = Pool() - self._phrase_key = self.mem.alloc(max_length, sizeof(attr_t)) self.max_length = max_length self.vocab = vocab self.matcher = Matcher(self.vocab) self.phrase_ids = PreshMap() - abstract_patterns = [] - for length in range(1, max_length): - abstract_patterns.append([{tag: True} - for tag in get_bilou(length)]) + abstract_patterns = [ + [{U_ENT: True}], + [{B2_ENT: True}, {L2_ENT: True}], + [{B3_ENT: True}, {I3_ENT: True}, {L3_ENT: True}], + [{B4_ENT: True}, {I4_ENT: True}, {I4_ENT: True, "OP": "+"}, {L4_ENT: True}], + ] self.matcher.add('Candidate', None, *abstract_patterns) self._callbacks = {} @@ -612,29 +598,24 @@ cdef class PhraseMatcher: *docs (Doc): `Doc` objects representing match patterns. """ cdef Doc doc - for doc in docs: - if len(doc) >= self.max_length: - msg = ( - "Pattern length (%d) >= phrase_matcher.max_length (%d). " - "Length can be set on initialization, up to 10." - ) - raise ValueError(msg % (len(doc), self.max_length)) cdef hash_t ent_id = self.matcher._normalize_key(key) self._callbacks[ent_id] = on_match cdef int length cdef int i cdef hash_t phrase_hash + cdef Pool mem = Pool() for doc in docs: length = doc.length + if length == 0: + continue tags = get_bilou(length) - for i in range(self.max_length): - self._phrase_key[i] = 0 + phrase_key = mem.alloc(length, sizeof(attr_t)) for i, tag in enumerate(tags): lexeme = self.vocab[doc.c[i].lex.orth] lexeme.set_flag(tag, True) - self._phrase_key[i] = lexeme.orth - phrase_hash = hash64(self._phrase_key, - self.max_length * sizeof(attr_t), 0) + phrase_key[i] = lexeme.orth + phrase_hash = hash64(phrase_key, + length * sizeof(attr_t), 0) self.phrase_ids.set(phrase_hash, ent_id) def __call__(self, Doc doc): @@ -670,14 +651,13 @@ cdef class PhraseMatcher: yield doc def accept_match(self, Doc doc, int start, int end): - assert (end - start) < self.max_length cdef int i, j - for i in range(self.max_length): - self._phrase_key[i] = 0 + cdef Pool mem = Pool() + phrase_key = mem.alloc(end-start, sizeof(attr_t)) for i, j in enumerate(range(start, end)): - self._phrase_key[i] = doc.c[j].lex.orth - cdef hash_t key = hash64(self._phrase_key, - self.max_length * sizeof(attr_t), 0) + phrase_key[i] = doc.c[j].lex.orth + cdef hash_t key = hash64(phrase_key, + (end-start) * sizeof(attr_t), 0) ent_id = self.phrase_ids.get(key) if ent_id == 0: return None From 70cd94f8660bdaa29d6a16f78071a0642c974baa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Feb 2018 13:46:00 +0100 Subject: [PATCH 22/23] Remove matcher2 from setup.py --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index db20f8ee6..7c26a7491 100755 --- a/setup.py +++ b/setup.py @@ -38,7 +38,6 @@ MOD_NAMES = [ 'spacy.tokens.span', 'spacy.tokens.token', 'spacy.matcher', - 'spacy.matcher2', 'spacy.syntax.ner', 'spacy.symbols', 'spacy.vectors', From 2bccad88152272af36c13973098695efd52a6bdd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Feb 2018 14:56:12 +0100 Subject: [PATCH 23/23] Fix incorrect matcher test --- spacy/tests/regression/test_issue1450.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/tests/regression/test_issue1450.py b/spacy/tests/regression/test_issue1450.py index cde5ce3ca..3cfec349f 100644 --- a/spacy/tests/regression/test_issue1450.py +++ b/spacy/tests/regression/test_issue1450.py @@ -13,8 +13,8 @@ from ...vocab import Vocab ('a b', 0, 2), ('a c', 0, 1), ('a b 
c', 0, 2), - ('a b b c', 0, 2), - ('a b b', 0, 2), + ('a b b c', 0, 3), + ('a b b', 0, 3), ] ) def test_issue1450_matcher_end_zero_plus(string, start, end): @@ -54,5 +54,6 @@ def test_issue1450_matcher_end_zero_plus(string, start, end): if start is None or end is None: assert matches == [] + print(matches) assert matches[-1][1] == start assert matches[-1][2] == end
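
For readers skimming the diffs above, the core of PATCH 16 and PATCH 19 is the decision table documented in the get_action() docstring. The following is a plain-Python restatement of that table, not the Cython implementation; action names mirror the action_t enum added in matcher.pyx.

REJECT, MATCH, ADVANCE, RETRY = 'REJECT', 'MATCH', 'ADVANCE', 'RETRY'
RETRY_EXTEND, MATCH_EXTEND, MATCH_REJECT = 'RETRY_EXTEND', 'MATCH_EXTEND', 'MATCH_REJECT'

def get_action(quantifier, is_match, is_final):
    # '!' patterns are stored as ZERO: invert the match and treat as ONE.
    # '+' never reaches here directly; _convert_strings compiles it into
    # ONE followed by ZERO_PLUS.
    if quantifier == 'ZERO':
        is_match, quantifier = not is_match, 'ONE'
    if quantifier == 'ONE':
        if is_match:
            return MATCH if is_final else ADVANCE
        return REJECT
    if quantifier == 'ZERO_PLUS':
        if is_match:
            return MATCH_EXTEND if is_final else RETRY_EXTEND
        # Non-match on a final 0+ token still matches, but without this token.
        return MATCH_REJECT if is_final else RETRY
    if quantifier == 'ZERO_ONE':
        if is_match:
            return MATCH if is_final else ADVANCE
        return MATCH_REJECT if is_final else RETRY
    raise ValueError(quantifier)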
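The test updates in PATCH 20 and PATCH 23 encode the new behaviour of ZERO_PLUS tails: every extension is reported as its own match, and the longest span comes last. A minimal usage sketch, assuming a pattern equivalent to the one in test_issue1450 (an 'a' token followed by zero or more 'b' tokens; the exact pattern is truncated in the diff, so this is an assumption):

from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.matcher import Matcher

matcher = Matcher(Vocab())
# Hypothetical pattern mirroring test_issue1450: 'a' followed by 'b'*
matcher.add('AB', None, [{'ORTH': 'a'}, {'ORTH': 'b', 'OP': '*'}])
doc = Doc(matcher.vocab, words=u'a b b'.split())
matches = matcher(doc)
# 'a', 'a b' and 'a b b' are all returned, so len(matches) == 3 and
# matches[-1] spans doc[0:3] -- matching the updated asserts above.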
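PATCH 21 removes the PhraseMatcher length cap in two ways: phrases of length 4 or more are tagged B4/I4.../L4 and matched by a single abstract pattern with an inner {'OP': '+'} token, and the phrase key is hashed over exactly the orth IDs the span contains instead of a zero-padded max_length buffer. A rough sketch of the key idea, with Python's built-in hash standing in for hash64 over the packed attr_t array (an assumption for illustration only):

phrase_ids = {}

def phrase_key(orth_ids):
    # Stand-in for hash64(phrase_key, length * sizeof(attr_t), 0): the key
    # depends only on the IDs the span contains, so phrase length is unbounded.
    return hash(tuple(orth_ids))

def add_phrase(orth_ids, ent_id):
    phrase_ids[phrase_key(orth_ids)] = ent_id

def accept_match(orth_ids):
    # Returns the ent_id if this exact token sequence was added, else None.
    return phrase_ids.get(phrase_key(orth_ids))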