From 0d9740e8266cfbdc74d15c708427f759f1f02588 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 19 Sep 2019 16:36:12 +0200 Subject: [PATCH 01/12] Replace PhraseMatcher with Aho-Corasick Replace PhraseMatcher with the Aho-Corasick algorithm over numpy arrays of the hash values for the relevant attribute. The implementation is based on FlashText. The speed should be similar to the previous PhraseMatcher. It is now possible to easily remove match IDs and matches don't go missing with large keyword lists / vocabularies. Fixes #4308. --- spacy/matcher/phrasematcher.pxd | 5 - spacy/matcher/phrasematcher.pyx | 267 +++++++++++---------- spacy/tests/matcher/test_phrase_matcher.py | 52 ++++ 3 files changed, 189 insertions(+), 135 deletions(-) diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd index 3aba1686f..e69de29bb 100644 --- a/spacy/matcher/phrasematcher.pxd +++ b/spacy/matcher/phrasematcher.pxd @@ -1,5 +0,0 @@ -from libcpp.vector cimport vector - -from ..typedefs cimport hash_t - -ctypedef vector[hash_t] hash_vec diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 9e8801cc1..76a66c506 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -2,28 +2,14 @@ # cython: profile=True from __future__ import unicode_literals -from libcpp.vector cimport vector -from cymem.cymem cimport Pool -from murmurhash.mrmr cimport hash64 -from preshed.maps cimport PreshMap +import numpy as np -from .matcher cimport Matcher from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, attr_id_t from ..vocab cimport Vocab from ..tokens.doc cimport Doc, get_token_attr -from ..typedefs cimport attr_t, hash_t from ._schemas import TOKEN_PATTERN_SCHEMA from ..errors import Errors, Warnings, deprecation_warning, user_warning -from ..attrs import FLAG61 as U_ENT -from ..attrs import FLAG60 as B2_ENT -from ..attrs import FLAG59 as B3_ENT -from ..attrs import FLAG58 as B4_ENT -from ..attrs import FLAG43 as L2_ENT -from ..attrs import FLAG42 as L3_ENT -from ..attrs import FLAG41 as L4_ENT -from ..attrs import FLAG42 as I3_ENT -from ..attrs import FLAG41 as I4_ENT cdef class PhraseMatcher: @@ -33,18 +19,18 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher USAGE: https://spacy.io/usage/rule-based-matching#phrasematcher + + Adapted from FlashText: https://github.com/vi3k6i5/flashtext + MIT License (see `LICENSE`) + Copyright (c) 2017 Vikash Singh (vikash.duliajan@gmail.com) """ - cdef Pool mem cdef Vocab vocab - cdef Matcher matcher - cdef PreshMap phrase_ids - cdef vector[hash_vec] ent_id_matrix - cdef int max_length + cdef unicode _terminal + cdef object keyword_trie_dict cdef attr_id_t attr - cdef public object _callbacks - cdef public object _patterns - cdef public object _docs - cdef public object _validate + cdef object _callbacks + cdef object _keywords + cdef bint _validate def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False): """Initialize the PhraseMatcher. 
@@ -58,10 +44,13 @@ cdef class PhraseMatcher: """ if max_length != 0: deprecation_warning(Warnings.W010) - self.mem = Pool() - self.max_length = max_length self.vocab = vocab - self.matcher = Matcher(self.vocab, validate=False) + self._terminal = '_terminal_' + self.keyword_trie_dict = dict() + self._callbacks = {} + self._keywords = {} + self._validate = validate + if isinstance(attr, long): self.attr = attr else: @@ -71,28 +60,15 @@ cdef class PhraseMatcher: if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: raise ValueError(Errors.E152.format(attr=attr)) self.attr = self.vocab.strings[attr] - self.phrase_ids = PreshMap() - abstract_patterns = [ - [{U_ENT: True}], - [{B2_ENT: True}, {L2_ENT: True}], - [{B3_ENT: True}, {I3_ENT: True}, {L3_ENT: True}], - [{B4_ENT: True}, {I4_ENT: True}, {I4_ENT: True, "OP": "+"}, {L4_ENT: True}], - ] - self.matcher.add("Candidate", None, *abstract_patterns) - self._callbacks = {} - self._docs = {} - self._validate = validate def __len__(self): - """Get the number of rules added to the matcher. Note that this only - returns the number of rules (identical with the number of IDs), not the - number of individual patterns. + """Get the number of match IDs added to the matcher. RETURNS (int): The number of rules. DOCS: https://spacy.io/api/phrasematcher#len """ - return len(self._docs) + return len(self._callbacks) def __contains__(self, key): """Check whether the matcher contains rules for a match ID. @@ -102,12 +78,48 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#contains """ - cdef hash_t ent_id = self.matcher._normalize_key(key) - return ent_id in self._callbacks + return key in self._callbacks - def __reduce__(self): - data = (self.vocab, self._docs, self._callbacks) - return (unpickle_matcher, data, None, None) + def remove(self, key): + """Remove a match-rule from the matcher by match ID. + + key (unicode): The match ID. + """ + if key not in self._keywords: + return + for keyword in self._keywords[key]: + current_dict = self.keyword_trie_dict + token_trie_list = [] + for tokens in keyword: + if tokens in current_dict: + token_trie_list.append((tokens, current_dict)) + current_dict = current_dict[tokens] + else: + # if token is not found, break out of the loop + current_dict = None + break + # remove the tokens from trie dict if there are no other + # keywords with them + if current_dict and self._terminal in current_dict: + # if this is the only remaining key, remove unnecessary paths + if current_dict[self._terminal] == [key]: + # we found a complete match for input keyword + token_trie_list.append((self._terminal, current_dict)) + token_trie_list.reverse() + for key_to_remove, dict_pointer in token_trie_list: + if len(dict_pointer.keys()) == 1: + dict_pointer.pop(key_to_remove) + else: + # more than one key means more than 1 path, + # delete not required path and keep the other + dict_pointer.pop(key_to_remove) + break + # otherwise simply remove the key + else: + current_dict[self._terminal].remove(key) + + del self._keywords[key] + del self._callbacks[key] def add(self, key, on_match, *docs): """Add a match-rule to the phrase-matcher. 
A match-rule consists of: an ID @@ -119,17 +131,13 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#add """ - cdef Doc doc - cdef hash_t ent_id = self.matcher._normalize_key(key) - self._callbacks[ent_id] = on_match - self._docs[ent_id] = docs - cdef int length - cdef int i - cdef hash_t phrase_hash - cdef Pool mem = Pool() + + _ = self.vocab[key] + self._callbacks[key] = on_match + self._keywords.setdefault(key, []) + for doc in docs: - length = doc.length - if length == 0: + if len(doc) == 0: continue if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged: raise ValueError(Errors.E155.format()) @@ -139,33 +147,18 @@ cdef class PhraseMatcher: and self.attr not in (DEP, POS, TAG, LEMMA): string_attr = self.vocab.strings[self.attr] user_warning(Warnings.W012.format(key=key, attr=string_attr)) - tags = get_biluo(length) - phrase_key = mem.alloc(length, sizeof(attr_t)) - for i, tag in enumerate(tags): - attr_value = self.get_lex_value(doc, i) - lexeme = self.vocab[attr_value] - lexeme.set_flag(tag, True) - phrase_key[i] = lexeme.orth - phrase_hash = hash64(phrase_key, length * sizeof(attr_t), 0) + keyword = self._convert_to_array(doc) + # keep track of keywords per key to make remove easier + # (would use a set, but can't hash numpy arrays) + if keyword not in self._keywords[key]: + self._keywords[key].append(keyword) + current_dict = self.keyword_trie_dict + for token in keyword: + current_dict = current_dict.setdefault(token, {}) + current_dict.setdefault(self._terminal, set()) + current_dict[self._terminal].add(key) - if phrase_hash in self.phrase_ids: - phrase_index = self.phrase_ids[phrase_hash] - ent_id_list = self.ent_id_matrix[phrase_index] - ent_id_list.append(ent_id) - self.ent_id_matrix[phrase_index] = ent_id_list - - else: - ent_id_list = hash_vec(1) - ent_id_list[0] = ent_id - new_index = self.ent_id_matrix.size() - if new_index == 0: - # PreshMaps can not contain 0 as value, so storing a dummy at 0 - self.ent_id_matrix.push_back(hash_vec(0)) - new_index = 1 - self.ent_id_matrix.push_back(ent_id_list) - self.phrase_ids.set(phrase_hash, new_index) - - def __call__(self, Doc doc): + def __call__(self, doc): """Find all sequences matching the supplied patterns on the `Doc`. doc (Doc): The document to match over. @@ -175,20 +168,62 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#call """ + doc_array = self._convert_to_array(doc) matches = [] - if self.attr == ORTH: - match_doc = doc - else: - # If we're not matching on the ORTH, match_doc will be a Doc whose - # token.orth values are the attribute values we're matching on, - # e.g. 
Doc(nlp.vocab, words=[token.pos_ for token in doc]) - words = [self.get_lex_value(doc, i) for i in range(len(doc))] - match_doc = Doc(self.vocab, words=words) - for _, start, end in self.matcher(match_doc): - ent_ids = self.accept_match(match_doc, start, end) - if ent_ids is not None: - for ent_id in ent_ids: - matches.append((ent_id, start, end)) + if doc_array is None or len(doc_array) == 0: + # if doc_array is empty or None just return empty list + return matches + current_dict = self.keyword_trie_dict + start = 0 + reset_current_dict = False + idx = 0 + doc_array_len = len(doc_array) + while idx < doc_array_len: + token = doc_array[idx] + # if end is present in current_dict + if self._terminal in current_dict or token in current_dict: + if self._terminal in current_dict: + ent_id = current_dict[self._terminal] + matches.append((self.vocab.strings[ent_id], start, idx)) + + # look for longer sequences from this position + if token in current_dict: + current_dict_continued = current_dict[token] + + idy = idx + 1 + while idy < doc_array_len: + inner_token = doc_array[idy] + if self._terminal in current_dict_continued: + ent_ids = current_dict_continued[self._terminal] + for ent_id in ent_ids: + matches.append((self.vocab.strings[ent_id], start, idy)) + if inner_token in current_dict_continued: + current_dict_continued = current_dict_continued[inner_token] + else: + break + idy += 1 + else: + # end of doc_array reached + if self._terminal in current_dict_continued: + ent_ids = current_dict_continued[self._terminal] + for ent_id in ent_ids: + matches.append((self.vocab.strings[ent_id], start, idy)) + current_dict = self.keyword_trie_dict + reset_current_dict = True + else: + # we reset current_dict + current_dict = self.keyword_trie_dict + reset_current_dict = True + # if we are end of doc_array and have a sequence discovered + if idx + 1 >= doc_array_len: + if self._terminal in current_dict: + ent_ids = current_dict[self._terminal] + for ent_id in ent_ids: + matches.append((self.vocab.strings[ent_id], start, doc_array_len)) + idx += 1 + if reset_current_dict: + reset_current_dict = False + start = idx for i, (ent_id, start, end) in enumerate(matches): on_match = self._callbacks.get(ent_id) if on_match is not None: @@ -228,19 +263,6 @@ cdef class PhraseMatcher: else: yield doc - def accept_match(self, Doc doc, int start, int end): - cdef int i, j - cdef Pool mem = Pool() - phrase_key = mem.alloc(end-start, sizeof(attr_t)) - for i, j in enumerate(range(start, end)): - phrase_key[i] = doc.c[j].lex.orth - cdef hash_t key = hash64(phrase_key, (end-start) * sizeof(attr_t), 0) - - ent_index = self.phrase_ids.get(key) - if ent_index == 0: - return None - return self.ent_id_matrix[ent_index] - def get_lex_value(self, Doc doc, int i): if self.attr == ORTH: # Return the regular orth value of the lexeme @@ -256,25 +278,10 @@ cdef class PhraseMatcher: # Concatenate the attr name and value to not pollute lexeme space # e.g. 
'POS-VERB' instead of just 'VERB', which could otherwise # create false positive matches - return "matcher:{}-{}".format(string_attr_name, string_attr_value) + matcher_attr_string = "matcher:{}-{}".format(string_attr_name, string_attr_value) + # Add new string to vocab + _ = self.vocab[matcher_attr_string] + return self.vocab.strings[matcher_attr_string] - -def get_biluo(length): - if length == 0: - raise ValueError(Errors.E127) - elif length == 1: - return [U_ENT] - elif length == 2: - return [B2_ENT, L2_ENT] - elif length == 3: - return [B3_ENT, I3_ENT, L3_ENT] - else: - return [B4_ENT, I4_ENT] + [I4_ENT] * (length-3) + [L4_ENT] - - -def unpickle_matcher(vocab, docs, callbacks): - matcher = PhraseMatcher(vocab) - for key, specs in docs.items(): - callback = callbacks.get(key, None) - matcher.add(key, callback, *specs) - return matcher + def _convert_to_array(self, Doc doc): + return np.array([self.get_lex_value(doc, i) for i in range(len(doc))], dtype=np.uint64) diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index b82f9a058..a9f5ac990 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -31,6 +31,58 @@ def test_phrase_matcher_contains(en_vocab): assert "TEST2" not in matcher +def test_phrase_matcher_repeated_add(en_vocab): + matcher = PhraseMatcher(en_vocab) + # match ID only gets added once + matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) + assert "TEST" in matcher + assert "TEST2" not in matcher + assert len(matcher(doc)) == 1 + + +def test_phrase_matcher_remove(en_vocab): + matcher = PhraseMatcher(en_vocab) + matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) + assert "TEST" in matcher + assert "TEST2" not in matcher + assert len(matcher(doc)) == 1 + matcher.remove("TEST") + assert "TEST" not in matcher + assert "TEST2" not in matcher + assert len(matcher(doc)) == 0 + matcher.remove("TEST2") + assert "TEST" not in matcher + assert "TEST2" not in matcher + assert len(matcher(doc)) == 0 + + +def test_phrase_matcher_overlapping_with_remove(en_vocab): + matcher = PhraseMatcher(en_vocab) + matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + # TEST2 is added alongside TEST + matcher.add("TEST2", None, Doc(en_vocab, words=["like"])) + doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) + assert "TEST" in matcher + assert len(matcher) == 2 + assert len(matcher(doc)) == 2 + # removing TEST does not remove the entry for TEST2 + matcher.remove("TEST") + assert "TEST" not in matcher + assert len(matcher) == 1 + assert len(matcher(doc)) == 1 + assert matcher(doc)[0][0] == en_vocab.strings["TEST2"] + # removing TEST2 removes all + matcher.remove("TEST2") + assert "TEST2" not in matcher + assert len(matcher) == 0 + assert len(matcher(doc)) == 0 + + def test_phrase_matcher_string_attrs(en_vocab): words1 = ["I", "like", "cats"] pos1 = ["PRON", "VERB", "NOUN"] From 0d851db6d9f7cc7630718091e64d227ce3e4bd89 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 19 Sep 2019 20:20:53 +0200 Subject: [PATCH 02/12] Restore support for pickling --- spacy/matcher/phrasematcher.pyx | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git 
a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 76a66c506..67f4ac83d 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -30,6 +30,7 @@ cdef class PhraseMatcher: cdef attr_id_t attr cdef object _callbacks cdef object _keywords + cdef object _docs cdef bint _validate def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False): @@ -49,6 +50,7 @@ cdef class PhraseMatcher: self.keyword_trie_dict = dict() self._callbacks = {} self._keywords = {} + self._docs = {} self._validate = validate if isinstance(attr, long): @@ -80,6 +82,10 @@ cdef class PhraseMatcher: """ return key in self._callbacks + def __reduce__(self): + data = (self.vocab, self._docs, self._callbacks) + return (unpickle_matcher, data, None, None) + def remove(self, key): """Remove a match-rule from the matcher by match ID. @@ -120,6 +126,7 @@ cdef class PhraseMatcher: del self._keywords[key] del self._callbacks[key] + del self._docs[key] def add(self, key, on_match, *docs): """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID @@ -135,6 +142,8 @@ cdef class PhraseMatcher: _ = self.vocab[key] self._callbacks[key] = on_match self._keywords.setdefault(key, []) + self._docs.setdefault(key, set()) + self._docs[key].update(docs) for doc in docs: if len(doc) == 0: @@ -285,3 +294,11 @@ cdef class PhraseMatcher: def _convert_to_array(self, Doc doc): return np.array([self.get_lex_value(doc, i) for i in range(len(doc))], dtype=np.uint64) + + +def unpickle_matcher(vocab, docs, callbacks): + matcher = PhraseMatcher(vocab) + for key, specs in docs.items(): + callback = callbacks.get(key, None) + matcher.add(key, callback, *specs) + return matcher From 3a4e1f5ca7ff550e90d0c309d8673a296e05e41c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 20 Sep 2019 09:18:38 +0200 Subject: [PATCH 03/12] Fix internal keyword add/remove for numpy arrays --- spacy/matcher/phrasematcher.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 67f4ac83d..ef55b0e68 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -122,7 +122,8 @@ cdef class PhraseMatcher: break # otherwise simply remove the key else: - current_dict[self._terminal].remove(key) + if key in current_dict[self._terminal]: + current_dict[self._terminal].remove(key) del self._keywords[key] del self._callbacks[key] @@ -159,8 +160,7 @@ cdef class PhraseMatcher: keyword = self._convert_to_array(doc) # keep track of keywords per key to make remove easier # (would use a set, but can't hash numpy arrays) - if keyword not in self._keywords[key]: - self._keywords[key].append(keyword) + self._keywords[key].append(keyword) current_dict = self.keyword_trie_dict for token in keyword: current_dict = current_dict.setdefault(token, {}) From c38c33058585a9de3be001d499b210b56f4d1a85 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sat, 21 Sep 2019 15:57:38 +0200 Subject: [PATCH 04/12] Add missing loop for match ID set in search loop --- spacy/matcher/phrasematcher.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index ef55b0e68..375b25b46 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -192,8 +192,9 @@ cdef class PhraseMatcher: # if end is present in current_dict if self._terminal in current_dict or token in current_dict: if self._terminal in current_dict: - ent_id = 
current_dict[self._terminal] - matches.append((self.vocab.strings[ent_id], start, idx)) + ent_ids = current_dict[self._terminal] + for ent_id in ent_ids: + matches.append((self.vocab.strings[ent_id], start, idx)) # look for longer sequences from this position if token in current_dict: From a7e9c0fd3e597cae890feecfe09c8a5f3c6a42a8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 23 Sep 2019 09:11:13 +0200 Subject: [PATCH 05/12] Remove cruft in matching loop for partial matches There was a bit of unnecessary code left over from FlashText in the matching loop to handle partial token matches, which we don't have with PhraseMatcher. --- spacy/matcher/phrasematcher.pyx | 64 +++++++--------------- spacy/tests/matcher/test_phrase_matcher.py | 27 ++++++++- 2 files changed, 45 insertions(+), 46 deletions(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 375b25b46..048b5bd68 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -184,56 +184,34 @@ cdef class PhraseMatcher: return matches current_dict = self.keyword_trie_dict start = 0 - reset_current_dict = False idx = 0 doc_array_len = len(doc_array) while idx < doc_array_len: + start = idx token = doc_array[idx] - # if end is present in current_dict - if self._terminal in current_dict or token in current_dict: - if self._terminal in current_dict: - ent_ids = current_dict[self._terminal] - for ent_id in ent_ids: - matches.append((self.vocab.strings[ent_id], start, idx)) - - # look for longer sequences from this position - if token in current_dict: - current_dict_continued = current_dict[token] - - idy = idx + 1 - while idy < doc_array_len: - inner_token = doc_array[idy] - if self._terminal in current_dict_continued: - ent_ids = current_dict_continued[self._terminal] - for ent_id in ent_ids: - matches.append((self.vocab.strings[ent_id], start, idy)) - if inner_token in current_dict_continued: - current_dict_continued = current_dict_continued[inner_token] - else: - break + # look for sequences from this position + if token in current_dict: + current_dict_continued = current_dict[token] + idy = idx + 1 + while idy < doc_array_len: + if self._terminal in current_dict_continued: + ent_ids = current_dict_continued[self._terminal] + for ent_id in ent_ids: + matches.append((self.vocab.strings[ent_id], start, idy)) + inner_token = doc_array[idy] + if inner_token in current_dict_continued: + current_dict_continued = current_dict_continued[inner_token] idy += 1 else: - # end of doc_array reached - if self._terminal in current_dict_continued: - ent_ids = current_dict_continued[self._terminal] - for ent_id in ent_ids: - matches.append((self.vocab.strings[ent_id], start, idy)) - current_dict = self.keyword_trie_dict - reset_current_dict = True - else: - # we reset current_dict - current_dict = self.keyword_trie_dict - reset_current_dict = True - # if we are end of doc_array and have a sequence discovered - if idx + 1 >= doc_array_len: - if self._terminal in current_dict: - ent_ids = current_dict[self._terminal] - for ent_id in ent_ids: - matches.append((self.vocab.strings[ent_id], start, doc_array_len)) + break + else: + # end of doc_array reached + if self._terminal in current_dict_continued: + ent_ids = current_dict_continued[self._terminal] + for ent_id in ent_ids: + matches.append((self.vocab.strings[ent_id], start, idy)) + current_dict = self.keyword_trie_dict idx += 1 - if reset_current_dict: - reset_current_dict = False - start = idx for i, (ent_id, start, end) in 
enumerate(matches): on_match = self._callbacks.get(ent_id) if on_match is not None: diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index a9f5ac990..3eb454c39 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -8,10 +8,31 @@ from ..util import get_doc def test_matcher_phrase_matcher(en_vocab): - doc = Doc(en_vocab, words=["Google", "Now"]) - matcher = PhraseMatcher(en_vocab) - matcher.add("COMPANY", None, doc) doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) + # intermediate phrase + pattern = Doc(en_vocab, words=["Google", "Now"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("COMPANY", None, pattern) + assert len(matcher(doc)) == 1 + # initial token + pattern = Doc(en_vocab, words=["I"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("I", None, pattern) + assert len(matcher(doc)) == 1 + # initial phrase + pattern = Doc(en_vocab, words=["I", "like"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("ILIKE", None, pattern) + assert len(matcher(doc)) == 1 + # final token + pattern = Doc(en_vocab, words=["best"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("BEST", None, pattern) + assert len(matcher(doc)) == 1 + # final phrase + pattern = Doc(en_vocab, words=["Now", "best"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("NOWBEST", None, pattern) assert len(matcher(doc)) == 1 From 39540ed1ce95e98a04daa26209dd6342f308c740 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 24 Sep 2019 14:39:50 +0200 Subject: [PATCH 06/12] Replace dict trie with MapStruct trie --- spacy/matcher/phrasematcher.pxd | 6 + spacy/matcher/phrasematcher.pyx | 190 +++++++++++++++------ spacy/tests/matcher/test_phrase_matcher.py | 25 ++- 3 files changed, 159 insertions(+), 62 deletions(-) diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd index e69de29bb..1a550989d 100644 --- a/spacy/matcher/phrasematcher.pxd +++ b/spacy/matcher/phrasematcher.pxd @@ -0,0 +1,6 @@ +from preshed.maps cimport key_t + +cdef struct MatchStruct: + key_t match_id + int start + int end diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 048b5bd68..172a271b0 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -2,10 +2,20 @@ # cython: profile=True from __future__ import unicode_literals +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf +from libcpp.vector cimport vector + +from cymem.cymem cimport Pool + +from preshed.maps cimport MapStruct, map_init, map_set, map_get_unless_missing +from preshed.maps cimport map_clear, map_iter, key_t, Result + import numpy as np from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, attr_id_t from ..vocab cimport Vocab +from ..strings cimport hash_string from ..tokens.doc cimport Doc, get_token_attr from ._schemas import TOKEN_PATTERN_SCHEMA @@ -25,14 +35,18 @@ cdef class PhraseMatcher: Copyright (c) 2017 Vikash Singh (vikash.duliajan@gmail.com) """ cdef Vocab vocab - cdef unicode _terminal - cdef object keyword_trie_dict cdef attr_id_t attr cdef object _callbacks cdef object _keywords cdef object _docs cdef bint _validate + cdef MapStruct* c_map + cdef Pool mem + cdef key_t _terminal_node + + cdef void find_matches(self, key_t* hash_array, int hash_array_len, vector[MatchStruct] *matches) nogil + def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False): """Initialize the PhraseMatcher. 
@@ -46,13 +60,16 @@ cdef class PhraseMatcher: if max_length != 0: deprecation_warning(Warnings.W010) self.vocab = vocab - self._terminal = '_terminal_' - self.keyword_trie_dict = dict() self._callbacks = {} self._keywords = {} self._docs = {} self._validate = validate + self.mem = Pool() + self.c_map = self.mem.alloc(1, sizeof(MapStruct)) + self._terminal_node = 1 # or random: np.random.randint(0, high=np.iinfo(np.uint64).max, dtype=np.uint64) + map_init(self.mem, self.c_map, 8) + if isinstance(attr, long): self.attr = attr else: @@ -93,37 +110,58 @@ cdef class PhraseMatcher: """ if key not in self._keywords: return + cdef MapStruct* current_node + cdef MapStruct* terminal_map + cdef MapStruct* node_pointer + cdef Result result + cdef key_t terminal_key + cdef void* value + cdef int c_i = 0 for keyword in self._keywords[key]: - current_dict = self.keyword_trie_dict + current_node = self.c_map token_trie_list = [] - for tokens in keyword: - if tokens in current_dict: - token_trie_list.append((tokens, current_dict)) - current_dict = current_dict[tokens] + for token in keyword: + result = map_get_unless_missing(current_node, token) + if result.found: + token_trie_list.append((token, current_node)) + current_node = result.value else: # if token is not found, break out of the loop - current_dict = None + current_node = NULL break - # remove the tokens from trie dict if there are no other + # remove the tokens from trie node if there are no other # keywords with them - if current_dict and self._terminal in current_dict: + result = map_get_unless_missing(current_node, self._terminal_node) + if current_node != NULL and result.found: # if this is the only remaining key, remove unnecessary paths - if current_dict[self._terminal] == [key]: + terminal_map = result.value + terminal_keys = [] + c_i = 0 + while map_iter(terminal_map, &c_i, &terminal_key, &value): + terminal_keys.append(self.vocab.strings[terminal_key]) + # TODO: not working, fix remove for unused paths/maps + if False and terminal_keys == [key]: # we found a complete match for input keyword - token_trie_list.append((self._terminal, current_dict)) + token_trie_list.append((self.vocab.strings[key], terminal_map)) token_trie_list.reverse() - for key_to_remove, dict_pointer in token_trie_list: - if len(dict_pointer.keys()) == 1: - dict_pointer.pop(key_to_remove) + for key_to_remove, py_node_pointer in token_trie_list: + node_pointer = py_node_pointer + result = map_get_unless_missing(node_pointer, key_to_remove) + if node_pointer.filled == 1: + map_clear(node_pointer, key_to_remove) + self.mem.free(result.value) + pass else: # more than one key means more than 1 path, # delete not required path and keep the other - dict_pointer.pop(key_to_remove) + map_clear(node_pointer, key_to_remove) + self.mem.free(result.value) break # otherwise simply remove the key else: - if key in current_dict[self._terminal]: - current_dict[self._terminal].remove(key) + result = map_get_unless_missing(current_node, self._terminal_node) + if result.found: + map_clear(result.value, self.vocab.strings[key]) del self._keywords[key] del self._callbacks[key] @@ -146,6 +184,10 @@ cdef class PhraseMatcher: self._docs.setdefault(key, set()) self._docs[key].update(docs) + cdef MapStruct* current_node + cdef MapStruct* internal_node + cdef Result result + for doc in docs: if len(doc) == 0: continue @@ -161,11 +203,23 @@ cdef class PhraseMatcher: # keep track of keywords per key to make remove easier # (would use a set, but can't hash numpy arrays) 
self._keywords[key].append(keyword) - current_dict = self.keyword_trie_dict + + current_node = self.c_map for token in keyword: - current_dict = current_dict.setdefault(token, {}) - current_dict.setdefault(self._terminal, set()) - current_dict[self._terminal].add(key) + result = map_get_unless_missing(current_node, token) + if not result.found: + internal_node = self.mem.alloc(1, sizeof(MapStruct)) + map_init(self.mem, internal_node, 8) + map_set(self.mem, current_node, token, internal_node) + result.value = internal_node + current_node = result.value + result = map_get_unless_missing(current_node, self._terminal_node) + if not result.found: + internal_node = self.mem.alloc(1, sizeof(MapStruct)) + map_init(self.mem, internal_node, 8) + map_set(self.mem, current_node, self._terminal_node, internal_node) + result.value = internal_node + map_set(self.mem, result.value, hash_string(key), NULL) def __call__(self, doc): """Find all sequences matching the supplied patterns on the `Doc`. @@ -182,42 +236,62 @@ cdef class PhraseMatcher: if doc_array is None or len(doc_array) == 0: # if doc_array is empty or None just return empty list return matches - current_dict = self.keyword_trie_dict - start = 0 - idx = 0 - doc_array_len = len(doc_array) - while idx < doc_array_len: - start = idx - token = doc_array[idx] - # look for sequences from this position - if token in current_dict: - current_dict_continued = current_dict[token] - idy = idx + 1 - while idy < doc_array_len: - if self._terminal in current_dict_continued: - ent_ids = current_dict_continued[self._terminal] - for ent_id in ent_ids: - matches.append((self.vocab.strings[ent_id], start, idy)) - inner_token = doc_array[idy] - if inner_token in current_dict_continued: - current_dict_continued = current_dict_continued[inner_token] - idy += 1 - else: - break - else: - # end of doc_array reached - if self._terminal in current_dict_continued: - ent_ids = current_dict_continued[self._terminal] - for ent_id in ent_ids: - matches.append((self.vocab.strings[ent_id], start, idy)) - current_dict = self.keyword_trie_dict - idx += 1 + + if not doc_array.flags['C_CONTIGUOUS']: + doc_array = np.ascontiguousarray(doc_array) + cdef key_t[::1] doc_array_memview = doc_array + cdef vector[MatchStruct] c_matches + self.find_matches(&doc_array_memview[0], doc_array_memview.shape[0], &c_matches) + for i in range(c_matches.size()): + matches.append((c_matches[i].match_id, c_matches[i].start, c_matches[i].end)) for i, (ent_id, start, end) in enumerate(matches): on_match = self._callbacks.get(ent_id) if on_match is not None: on_match(self, doc, i, matches) return matches + cdef void find_matches(self, key_t* hash_array, int hash_array_len, vector[MatchStruct] *matches) nogil: + cdef MapStruct* current_node = self.c_map + cdef int start = 0 + cdef int idx = 0 + cdef int idy = 0 + cdef key_t key + cdef void* value + cdef int i = 0 + cdef MatchStruct ms + while idx < hash_array_len: + start = idx + token = hash_array[idx] + # look for sequences from this position + result = map_get_unless_missing(current_node, token) + if result.found: + current_node = result.value + idy = idx + 1 + while idy < hash_array_len: + result = map_get_unless_missing(current_node, self._terminal_node) + if result.found: + i = 0 + while map_iter(result.value, &i, &key, &value): + ms = make_matchstruct(key, start, idy) + matches.push_back(ms) + inner_token = hash_array[idy] + result = map_get_unless_missing(current_node, inner_token) + if result.found: + current_node = result.value + idy += 1 + else: 
+ break + else: + # end of hash_array reached + result = map_get_unless_missing(current_node, self._terminal_node) + if result.found: + i = 0 + while map_iter(result.value, &i, &key, &value): + ms = make_matchstruct(key, start, idy) + matches.push_back(ms) + current_node = self.c_map + idx += 1 + def pipe(self, stream, batch_size=1000, n_threads=-1, return_matches=False, as_tuples=False): """Match a stream of documents, yielding them in turn. @@ -281,3 +355,11 @@ def unpickle_matcher(vocab, docs, callbacks): callback = callbacks.get(key, None) matcher.add(key, callback, *specs) return matcher + + +cdef MatchStruct make_matchstruct(key_t match_id, int start, int end) nogil: + cdef MatchStruct ms + ms.match_id = match_id + ms.start = start + ms.end = end + return ms diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 3eb454c39..7d65d0007 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -67,18 +67,27 @@ def test_phrase_matcher_repeated_add(en_vocab): def test_phrase_matcher_remove(en_vocab): matcher = PhraseMatcher(en_vocab) - matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + matcher.add("TEST1", None, Doc(en_vocab, words=["like"])) + matcher.add("TEST2", None, Doc(en_vocab, words=["best"])) doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) - assert "TEST" in matcher - assert "TEST2" not in matcher + assert "TEST1" in matcher + assert "TEST2" in matcher + assert "TEST3" not in matcher + assert len(matcher(doc)) == 2 + matcher.remove("TEST1") + assert "TEST1" not in matcher + assert "TEST2" in matcher + assert "TEST3" not in matcher assert len(matcher(doc)) == 1 - matcher.remove("TEST") - assert "TEST" not in matcher - assert "TEST2" not in matcher - assert len(matcher(doc)) == 0 matcher.remove("TEST2") - assert "TEST" not in matcher + assert "TEST1" not in matcher assert "TEST2" not in matcher + assert "TEST3" not in matcher + assert len(matcher(doc)) == 0 + matcher.remove("TEST3") + assert "TEST1" not in matcher + assert "TEST2" not in matcher + assert "TEST3" not in matcher assert len(matcher(doc)) == 0 From d4141302b6ba8f1a7b4cb178e22a01914720456c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 24 Sep 2019 15:36:26 +0200 Subject: [PATCH 07/12] Fix how match ID hash is stored/added --- spacy/matcher/phrasematcher.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 172a271b0..940ebb60e 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -15,7 +15,6 @@ import numpy as np from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, attr_id_t from ..vocab cimport Vocab -from ..strings cimport hash_string from ..tokens.doc cimport Doc, get_token_attr from ._schemas import TOKEN_PATTERN_SCHEMA @@ -178,7 +177,7 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#add """ - _ = self.vocab[key] + _ = self.vocab.strings[key] self._callbacks[key] = on_match self._keywords.setdefault(key, []) self._docs.setdefault(key, set()) @@ -219,7 +218,7 @@ cdef class PhraseMatcher: map_init(self.mem, internal_node, 8) map_set(self.mem, current_node, self._terminal_node, internal_node) result.value = internal_node - map_set(self.mem, result.value, hash_string(key), NULL) + map_set(self.mem, result.value, self.vocab.strings[key], NULL) def __call__(self, doc): """Find all sequences matching the supplied patterns on the `Doc`. 
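
A note on the data structure at this point in the series: the trie has settled into its final shape, where each node is a preshed map keyed by 64-bit attribute hashes, and a reserved terminal key points at an inner map whose keys are the match-ID hashes. A simplified dict-based sketch of the same layout (TERMINAL stands in for the reserved terminal hash; all names here are illustrative, not the actual implementation):

    # Simplified dict analogue of the MapStruct trie built in add().
    # TERMINAL stands in for the reserved terminal hash key.
    TERMINAL = object()

    def add_keyword(trie, token_hashes, match_id):
        # walk/create one child node per token hash
        node = trie
        for token in token_hashes:
            node = node.setdefault(token, {})
        # record the match ID under the terminal key of the final node
        node.setdefault(TERMINAL, set()).add(match_id)

    trie = {}
    add_keyword(trie, ["google", "now"], "COMPANY")
    add_keyword(trie, ["google"], "ORG")
    # trie is now {"google": {TERMINAL: {"ORG"}, "now": {TERMINAL: {"COMPANY"}}}}

Keeping the match IDs in a per-terminal set (an inner map in the real code) is what lets several match IDs share one keyword and later be removed independently.
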
From 34550ef66218a7e6c39e3349ea51930b09f103d0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 24 Sep 2019 16:07:38 +0200 Subject: [PATCH 08/12] Update fix for match ID vocab --- spacy/matcher/phrasematcher.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 940ebb60e..05231ffae 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -177,7 +177,7 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#add """ - _ = self.vocab.strings[key] + _ = self.vocab[key] self._callbacks[key] = on_match self._keywords.setdefault(key, []) self._docs.setdefault(key, set()) From d995a7849e4c15f9cbc470410b543609f54b3a68 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 24 Sep 2019 16:20:24 +0200 Subject: [PATCH 09/12] Switch from map_get_unless_missing to map_get --- spacy/matcher/phrasematcher.pyx | 73 +++++++++++++++++---------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 05231ffae..d67cda005 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -8,8 +8,8 @@ from libcpp.vector cimport vector from cymem.cymem cimport Pool -from preshed.maps cimport MapStruct, map_init, map_set, map_get_unless_missing -from preshed.maps cimport map_clear, map_iter, key_t, Result +from preshed.maps cimport MapStruct, map_init, map_set, map_get, map_clear +from preshed.maps cimport map_iter, key_t import numpy as np @@ -112,7 +112,7 @@ cdef class PhraseMatcher: cdef MapStruct* current_node cdef MapStruct* terminal_map cdef MapStruct* node_pointer - cdef Result result + cdef void* result cdef key_t terminal_key cdef void* value cdef int c_i = 0 @@ -120,20 +120,20 @@ cdef class PhraseMatcher: current_node = self.c_map token_trie_list = [] for token in keyword: - result = map_get_unless_missing(current_node, token) - if result.found: + result = map_get(current_node, token) + if result: token_trie_list.append((token, current_node)) - current_node = result.value + current_node = result else: # if token is not found, break out of the loop current_node = NULL break # remove the tokens from trie node if there are no other # keywords with them - result = map_get_unless_missing(current_node, self._terminal_node) - if current_node != NULL and result.found: + result = map_get(current_node, self._terminal_node) + if current_node != NULL and result: # if this is the only remaining key, remove unnecessary paths - terminal_map = result.value + terminal_map = result terminal_keys = [] c_i = 0 while map_iter(terminal_map, &c_i, &terminal_key, &value): @@ -145,22 +145,22 @@ cdef class PhraseMatcher: token_trie_list.reverse() for key_to_remove, py_node_pointer in token_trie_list: node_pointer = py_node_pointer - result = map_get_unless_missing(node_pointer, key_to_remove) + result = map_get(node_pointer, key_to_remove) if node_pointer.filled == 1: map_clear(node_pointer, key_to_remove) - self.mem.free(result.value) + self.mem.free(result) pass else: # more than one key means more than 1 path, # delete not required path and keep the other map_clear(node_pointer, key_to_remove) - self.mem.free(result.value) + self.mem.free(result) break # otherwise simply remove the key else: - result = map_get_unless_missing(current_node, self._terminal_node) - if result.found: - map_clear(result.value, self.vocab.strings[key]) + result = map_get(current_node, self._terminal_node) + if 
result: + map_clear(result, self.vocab.strings[key]) del self._keywords[key] del self._callbacks[key] @@ -185,7 +185,7 @@ cdef class PhraseMatcher: cdef MapStruct* current_node cdef MapStruct* internal_node - cdef Result result + cdef void* result for doc in docs: if len(doc) == 0: @@ -205,20 +205,20 @@ cdef class PhraseMatcher: current_node = self.c_map for token in keyword: - result = map_get_unless_missing(current_node, token) - if not result.found: + result = map_get(current_node, token) + if not result: internal_node = self.mem.alloc(1, sizeof(MapStruct)) map_init(self.mem, internal_node, 8) map_set(self.mem, current_node, token, internal_node) - result.value = internal_node - current_node = result.value - result = map_get_unless_missing(current_node, self._terminal_node) - if not result.found: + result = internal_node + current_node = result + result = map_get(current_node, self._terminal_node) + if not result: internal_node = self.mem.alloc(1, sizeof(MapStruct)) map_init(self.mem, internal_node, 8) map_set(self.mem, current_node, self._terminal_node, internal_node) - result.value = internal_node - map_set(self.mem, result.value, self.vocab.strings[key], NULL) + result = internal_node + map_set(self.mem, result, self.vocab.strings[key], NULL) def __call__(self, doc): """Find all sequences matching the supplied patterns on the `Doc`. @@ -258,34 +258,35 @@ cdef class PhraseMatcher: cdef void* value cdef int i = 0 cdef MatchStruct ms + cdef void* result while idx < hash_array_len: start = idx token = hash_array[idx] # look for sequences from this position - result = map_get_unless_missing(current_node, token) - if result.found: - current_node = result.value + result = map_get(current_node, token) + if result: + current_node = result idy = idx + 1 while idy < hash_array_len: - result = map_get_unless_missing(current_node, self._terminal_node) - if result.found: + result = map_get(current_node, self._terminal_node) + if result: i = 0 - while map_iter(result.value, &i, &key, &value): + while map_iter(result, &i, &key, &value): ms = make_matchstruct(key, start, idy) matches.push_back(ms) inner_token = hash_array[idy] - result = map_get_unless_missing(current_node, inner_token) - if result.found: - current_node = result.value + result = map_get(current_node, inner_token) + if result: + current_node = result idy += 1 else: break else: # end of hash_array reached - result = map_get_unless_missing(current_node, self._terminal_node) - if result.found: + result = map_get(current_node, self._terminal_node) + if result: i = 0 - while map_iter(result.value, &i, &key, &value): + while map_iter(result, &i, &key, &value): ms = make_matchstruct(key, start, idy) matches.push_back(ms) current_node = self.c_map From 3c6f1d7e3afb4b702b54109d400c6fc6bfa78f24 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 25 Sep 2019 09:41:27 +0200 Subject: [PATCH 10/12] Switch from numpy array to Token.get_struct_attr Access token attributes directly in Doc instead of making a copy of the relevant values in a numpy array. Add unsatisfactory warning for hash collision with reserved terminal hash key. (Ideally it would change the reserved terminal hash and redo the whole trie, but for now, I'm hoping there won't be collisions.) 
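
In outline, the matching walk now reads each token's attribute hash on the fly instead of from a precomputed copy. A rough pure-Python equivalent of the nogil find_matches() loop over a dict-style trie (TERMINAL and get_attr are illustrative stand-ins; match ends are exclusive token offsets, as in the real matcher):

    # Rough pure-Python equivalent of find_matches(): attribute hashes
    # are read per token rather than copied into an array up front.
    def find_matches(trie, doc, get_attr, TERMINAL):
        matches = []
        for idx in range(len(doc)):
            node = trie.get(get_attr(doc[idx]))
            idy = idx + 1
            while node is not None:
                # emit a match for every ID stored on this node
                for match_id in node.get(TERMINAL, ()):
                    matches.append((match_id, idx, idy))
                if idy >= len(doc):
                    break
                node = node.get(get_attr(doc[idy]))
                idy += 1
        return matches

    trie = {"a": {"__end__": {"ID1"}, "b": {"__end__": {"ID2"}}}}
    find_matches(trie, ["a", "b"], lambda t: t, "__end__")
    # -> [("ID1", 0, 1), ("ID2", 0, 2)]
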
--- spacy/errors.py | 2 + spacy/matcher/phrasematcher.pyx | 69 ++++++++++++--------------------- 2 files changed, 26 insertions(+), 45 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index b03bd6d23..da6694bb7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -86,6 +86,8 @@ class Warnings(object): "previously loaded vectors. See Issue #3853.") W020 = ("Unnamed vectors. This won't allow multiple vectors models to be " "loaded. (Shape: {shape})") + W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " + "incorrect. Modify PhraseMatcher._terminal_hash to fix.") @add_codes diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index d67cda005..aedb16dcc 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -11,10 +11,10 @@ from cymem.cymem cimport Pool from preshed.maps cimport MapStruct, map_init, map_set, map_get, map_clear from preshed.maps cimport map_iter, key_t -import numpy as np - from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, attr_id_t from ..vocab cimport Vocab +from ..structs cimport TokenC +from ..tokens.token cimport Token from ..tokens.doc cimport Doc, get_token_attr from ._schemas import TOKEN_PATTERN_SCHEMA @@ -42,9 +42,9 @@ cdef class PhraseMatcher: cdef MapStruct* c_map cdef Pool mem - cdef key_t _terminal_node + cdef key_t _terminal_hash - cdef void find_matches(self, key_t* hash_array, int hash_array_len, vector[MatchStruct] *matches) nogil + cdef void find_matches(self, Doc doc, vector[MatchStruct] *matches) nogil def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False): """Initialize the PhraseMatcher. @@ -66,7 +66,7 @@ cdef class PhraseMatcher: self.mem = Pool() self.c_map = self.mem.alloc(1, sizeof(MapStruct)) - self._terminal_node = 1 # or random: np.random.randint(0, high=np.iinfo(np.uint64).max, dtype=np.uint64) + self._terminal_hash = 826361138722620965 map_init(self.mem, self.c_map, 8) if isinstance(attr, long): @@ -130,7 +130,7 @@ cdef class PhraseMatcher: break # remove the tokens from trie node if there are no other # keywords with them - result = map_get(current_node, self._terminal_node) + result = map_get(current_node, self._terminal_hash) if current_node != NULL and result: # if this is the only remaining key, remove unnecessary paths terminal_map = result @@ -158,7 +158,7 @@ cdef class PhraseMatcher: break # otherwise simply remove the key else: - result = map_get(current_node, self._terminal_node) + result = map_get(current_node, self._terminal_hash) if result: map_clear(result, self.vocab.strings[key]) @@ -205,6 +205,9 @@ cdef class PhraseMatcher: current_node = self.c_map for token in keyword: + if token == self._terminal_hash: + user_warning(Warnings.W021) + break result = map_get(current_node, token) if not result: internal_node = self.mem.alloc(1, sizeof(MapStruct)) @@ -212,11 +215,11 @@ cdef class PhraseMatcher: map_set(self.mem, current_node, token, internal_node) result = internal_node current_node = result - result = map_get(current_node, self._terminal_node) + result = map_get(current_node, self._terminal_hash) if not result: internal_node = self.mem.alloc(1, sizeof(MapStruct)) map_init(self.mem, internal_node, 8) - map_set(self.mem, current_node, self._terminal_node, internal_node) + map_set(self.mem, current_node, self._terminal_hash, internal_node) result = internal_node map_set(self.mem, result, self.vocab.strings[key], NULL) @@ -230,17 +233,13 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#call """ - doc_array = 
self._convert_to_array(doc) matches = [] - if doc_array is None or len(doc_array) == 0: - # if doc_array is empty or None just return empty list + if doc is None or len(doc) == 0: + # if doc is empty or None just return empty list return matches - if not doc_array.flags['C_CONTIGUOUS']: - doc_array = np.ascontiguousarray(doc_array) - cdef key_t[::1] doc_array_memview = doc_array cdef vector[MatchStruct] c_matches - self.find_matches(&doc_array_memview[0], doc_array_memview.shape[0], &c_matches) + self.find_matches(doc, &c_matches) for i in range(c_matches.size()): matches.append((c_matches[i].match_id, c_matches[i].start, c_matches[i].end)) for i, (ent_id, start, end) in enumerate(matches): @@ -249,7 +248,7 @@ cdef class PhraseMatcher: on_match(self, doc, i, matches) return matches - cdef void find_matches(self, key_t* hash_array, int hash_array_len, vector[MatchStruct] *matches) nogil: + cdef void find_matches(self, Doc doc, vector[MatchStruct] *matches) nogil: cdef MapStruct* current_node = self.c_map cdef int start = 0 cdef int idx = 0 @@ -259,22 +258,22 @@ cdef class PhraseMatcher: cdef int i = 0 cdef MatchStruct ms cdef void* result - while idx < hash_array_len: + while idx < doc.length: start = idx - token = hash_array[idx] + token = Token.get_struct_attr(&doc.c[idx], self.attr) # look for sequences from this position result = map_get(current_node, token) if result: current_node = result idy = idx + 1 - while idy < hash_array_len: - result = map_get(current_node, self._terminal_node) + while idy < doc.length: + result = map_get(current_node, self._terminal_hash) if result: i = 0 while map_iter(result, &i, &key, &value): ms = make_matchstruct(key, start, idy) matches.push_back(ms) - inner_token = hash_array[idy] + inner_token = Token.get_struct_attr(&doc.c[idy], self.attr) result = map_get(current_node, inner_token) if result: current_node = result @@ -282,8 +281,8 @@ cdef class PhraseMatcher: else: break else: - # end of hash_array reached - result = map_get(current_node, self._terminal_node) + # end of doc reached + result = map_get(current_node, self._terminal_hash) if result: i = 0 while map_iter(result, &i, &key, &value): @@ -325,28 +324,8 @@ cdef class PhraseMatcher: else: yield doc - def get_lex_value(self, Doc doc, int i): - if self.attr == ORTH: - # Return the regular orth value of the lexeme - return doc.c[i].lex.orth - # Get the attribute value instead, e.g. token.pos - attr_value = get_token_attr(&doc.c[i], self.attr) - if attr_value in (0, 1): - # Value is boolean, convert to string - string_attr_value = str(attr_value) - else: - string_attr_value = self.vocab.strings[attr_value] - string_attr_name = self.vocab.strings[self.attr] - # Concatenate the attr name and value to not pollute lexeme space - # e.g. 
'POS-VERB' instead of just 'VERB', which could otherwise - # create false positive matches - matcher_attr_string = "matcher:{}-{}".format(string_attr_name, string_attr_value) - # Add new string to vocab - _ = self.vocab[matcher_attr_string] - return self.vocab.strings[matcher_attr_string] - def _convert_to_array(self, Doc doc): - return np.array([self.get_lex_value(doc, i) for i in range(len(doc))], dtype=np.uint64) + return [Token.get_struct_attr(&doc.c[i], self.attr) for i in range(len(doc))] def unpickle_matcher(vocab, docs, callbacks): From 7862a6eb016ed10a24d9ea237816c5ae9157cf01 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 25 Sep 2019 11:03:58 +0200 Subject: [PATCH 11/12] Restructure imports to export find_matches --- spacy/matcher/phrasematcher.pxd | 24 +++++++++++++++++++++++- spacy/matcher/phrasematcher.pyx | 23 ++--------------------- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd index 1a550989d..8695a8114 100644 --- a/spacy/matcher/phrasematcher.pxd +++ b/spacy/matcher/phrasematcher.pxd @@ -1,4 +1,26 @@ -from preshed.maps cimport key_t +from libcpp.vector cimport vector + +from cymem.cymem cimport Pool +from preshed.maps cimport key_t, MapStruct + +from ..attrs cimport attr_id_t +from ..tokens.doc cimport Doc +from ..vocab cimport Vocab + + +cdef class PhraseMatcher: + cdef Vocab vocab + cdef attr_id_t attr + cdef object _callbacks + cdef object _keywords + cdef object _docs + cdef bint _validate + cdef MapStruct* c_map + cdef Pool mem + cdef key_t _terminal_hash + + cdef void find_matches(self, Doc doc, vector[MatchStruct] *matches) nogil + cdef struct MatchStruct: key_t match_id diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index aedb16dcc..75881848a 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -3,19 +3,12 @@ from __future__ import unicode_literals from libc.stdint cimport uintptr_t -from libc.stdio cimport printf -from libcpp.vector cimport vector -from cymem.cymem cimport Pool +from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter -from preshed.maps cimport MapStruct, map_init, map_set, map_get, map_clear -from preshed.maps cimport map_iter, key_t - -from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, attr_id_t -from ..vocab cimport Vocab +from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA from ..structs cimport TokenC from ..tokens.token cimport Token -from ..tokens.doc cimport Doc, get_token_attr from ._schemas import TOKEN_PATTERN_SCHEMA from ..errors import Errors, Warnings, deprecation_warning, user_warning @@ -33,18 +26,6 @@ cdef class PhraseMatcher: MIT License (see `LICENSE`) Copyright (c) 2017 Vikash Singh (vikash.duliajan@gmail.com) """ - cdef Vocab vocab - cdef attr_id_t attr - cdef object _callbacks - cdef object _keywords - cdef object _docs - cdef bint _validate - - cdef MapStruct* c_map - cdef Pool mem - cdef key_t _terminal_hash - - cdef void find_matches(self, Doc doc, vector[MatchStruct] *matches) nogil def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False): """Initialize the PhraseMatcher. From 3fdb22d832c01faf179f8e597de10287d4d72fd9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 26 Sep 2019 11:31:03 +0200 Subject: [PATCH 12/12] Implement full remove() Remove unnecessary trie paths and free unused maps. Parallel to Matcher, raise KeyError when attempting to remove a match ID that has not been added. 
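
On the dict analogue, the pruning amounts to walking the keyword's path and, if the removed key was the sole owner of the terminal node, deleting nodes bottom-up until one is still shared with another keyword. A sketch with illustrative names (the real version walks preshed maps and frees them through the pool):

    # Dict-based sketch of the pruning in remove(): a keyword's nodes are
    # deleted bottom-up only while no other keyword still needs them.
    def remove_keyword(trie, token_hashes, match_id, TERMINAL):
        path, node = [], trie
        for token in token_hashes:
            if token not in node:
                return              # keyword not in trie; nothing to prune
            path.append((node, token))
            node = node[token]
        terminal = node.get(TERMINAL)
        if terminal == {match_id}:
            del node[TERMINAL]      # sole owner: drop terminal, prune upward
            for parent, key in reversed(path):
                if parent[key]:     # node still shared with another keyword
                    break
                del parent[key]
        elif terminal is not None:
            terminal.discard(match_id)

The KeyError for unknown match IDs is raised up front in remove() itself, before any per-keyword pruning.
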
--- spacy/matcher/phrasematcher.pyx | 30 ++++++++++++---------- spacy/tests/matcher/test_phrase_matcher.py | 3 ++- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 75881848a..68428d843 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -84,12 +84,13 @@ cdef class PhraseMatcher: return (unpickle_matcher, data, None, None) def remove(self, key): - """Remove a match-rule from the matcher by match ID. + """Remove a rule from the matcher by match ID. A KeyError is raised if + the key does not exist. key (unicode): The match ID. """ if key not in self._keywords: - return + raise KeyError(key) cdef MapStruct* current_node cdef MapStruct* terminal_map cdef MapStruct* node_pointer @@ -97,13 +98,16 @@ cdef class PhraseMatcher: cdef key_t terminal_key cdef void* value cdef int c_i = 0 + cdef vector[MapStruct*] path_nodes + cdef vector[key_t] path_keys + cdef key_t key_to_remove for keyword in self._keywords[key]: current_node = self.c_map - token_trie_list = [] for token in keyword: result = map_get(current_node, token) if result: - token_trie_list.append((token, current_node)) + path_nodes.push_back(current_node) + path_keys.push_back(token) current_node = result else: # if token is not found, break out of the loop @@ -113,27 +117,25 @@ cdef class PhraseMatcher: # keywords with them result = map_get(current_node, self._terminal_hash) if current_node != NULL and result: - # if this is the only remaining key, remove unnecessary paths terminal_map = result terminal_keys = [] c_i = 0 while map_iter(terminal_map, &c_i, &terminal_key, &value): terminal_keys.append(self.vocab.strings[terminal_key]) - # TODO: not working, fix remove for unused paths/maps - if False and terminal_keys == [key]: - # we found a complete match for input keyword - token_trie_list.append((self.vocab.strings[key], terminal_map)) - token_trie_list.reverse() - for key_to_remove, py_node_pointer in token_trie_list: - node_pointer = py_node_pointer + # if this is the only remaining key, remove unnecessary paths + if terminal_keys == [key]: + while not path_nodes.empty(): + node_pointer = path_nodes.back() + path_nodes.pop_back() + key_to_remove = path_keys.back() + path_keys.pop_back() result = map_get(node_pointer, key_to_remove) if node_pointer.filled == 1: map_clear(node_pointer, key_to_remove) self.mem.free(result) - pass else: # more than one key means more than 1 path, - # delete not required path and keep the other + # delete not required path and keep the others map_clear(node_pointer, key_to_remove) self.mem.free(result) break diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 7d65d0007..486cbb984 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -84,7 +84,8 @@ def test_phrase_matcher_remove(en_vocab): assert "TEST2" not in matcher assert "TEST3" not in matcher assert len(matcher(doc)) == 0 - matcher.remove("TEST3") + with pytest.raises(KeyError): + matcher.remove("TEST3") assert "TEST1" not in matcher assert "TEST2" not in matcher assert "TEST3" not in matcher
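
With the series applied, the behaviour pinned down by the tests can be summarised in a short usage sketch (assuming an `en_vocab` Vocab such as the pytest fixture provides):

    from spacy.matcher import PhraseMatcher
    from spacy.tokens import Doc

    matcher = PhraseMatcher(en_vocab)   # en_vocab: a Vocab, e.g. the test fixture
    matcher.add("COMPANY", None, Doc(en_vocab, words=["Google", "Now"]))
    doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
    assert len(matcher(doc)) == 1       # one (match_id, start, end) triple
    matcher.remove("COMPANY")           # prunes the trie; KeyError if ID unknown
    assert len(matcher(doc)) == 0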