spaCy/spacy/tests/matcher/test_phrase_matcher.py

510 lines
18 KiB
Python
Raw Normal View History

import pytest
import warnings
import srsly
from mock import Mock
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher, Matcher
2020-08-31 15:53:22 +03:00
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from ..util import make_tempdir
@pytest.mark.issue(3248)
def test_issue3248_1():
"""Test that the PhraseMatcher correctly reports its number of rules, not
total number of patterns."""
nlp = English()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
matcher.add("TEST2", [nlp("d")])
assert len(matcher) == 2
@pytest.mark.issue(3331)
def test_issue3331(en_vocab):
"""Test that duplicate patterns for different rules result in multiple
matches, one per rule.
"""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])])
matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])])
doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
matches = matcher(doc)
assert len(matches) == 2
match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]]
assert sorted(match_ids) == ["A", "B"]
@pytest.mark.issue(3972)
def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs."""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
matches = matcher(doc)
assert len(matches) == 2
# We should have a match for each of the two rules
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
assert "A" in found_ids
assert "B" in found_ids
@pytest.mark.issue(4002)
def test_issue4002(en_vocab):
"""Test that the PhraseMatcher can match on overwritten NORM attributes."""
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern1 = Doc(en_vocab, words=["c", "d"])
assert [t.norm_ for t in pattern1] == ["c", "d"]
matcher.add("TEST", [pattern1])
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
matches = matcher(doc)
assert len(matches) == 1
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern2 = Doc(en_vocab, words=["1", "2"])
pattern2[0].norm_ = "c"
pattern2[1].norm_ = "d"
assert [t.norm_ for t in pattern2] == ["c", "d"]
matcher.add("TEST", [pattern2])
matches = matcher(doc)
assert len(matches) == 1
@pytest.mark.issue(4373)
def test_issue4373():
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
matcher = Matcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
matcher = PhraseMatcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
@pytest.mark.issue(4651)
def test_issue4651_with_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialized correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
ruler.add_patterns(patterns)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded
@pytest.mark.issue(6839)
def test_issue6839(en_vocab):
"""Ensure that PhraseMatcher accepts Span as input"""
# fmt: off
words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
# fmt: on
doc = Doc(en_vocab, words=words)
span = doc[:8]
pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
matcher = PhraseMatcher(en_vocab)
matcher.add("SPACY", [pattern])
matches = matcher(span)
assert matches
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * **total execution time down from > 300 seconds to under 60 seconds** 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-07-25 00:38:44 +03:00
@pytest.mark.issue(10643)
def test_issue10643(en_vocab):
"""Ensure overlapping terms can be removed from PhraseMatcher"""
# fmt: off
words = ["Only", "save", "out", "the", "binary", "data", "for", "the", "individual", "components", "."]
# fmt: on
doc = Doc(en_vocab, words=words)
terms = {
"0": Doc(en_vocab, words=["binary"]),
"1": Doc(en_vocab, words=["binary", "data"]),
}
matcher = PhraseMatcher(en_vocab)
for match_id, term in terms.items():
matcher.add(match_id, [term])
matches = matcher(doc)
assert matches == [(en_vocab.strings["0"], 4, 5), (en_vocab.strings["1"], 4, 6)]
matcher.remove("0")
assert len(matcher) == 1
new_matches = matcher(doc)
assert new_matches == [(en_vocab.strings["1"], 4, 6)]
matcher.remove("1")
assert len(matcher) == 0
no_matches = matcher(doc)
assert not no_matches
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * **total execution time down from > 300 seconds to under 60 seconds** 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-07-25 00:38:44 +03:00
def test_matcher_phrase_matcher(en_vocab):
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
Replace PhraseMatcher with trie-based search (#4309) * Replace PhraseMatcher with Aho-Corasick Replace PhraseMatcher with the Aho-Corasick algorithm over numpy arrays of the hash values for the relevant attribute. The implementation is based on FlashText. The speed should be similar to the previous PhraseMatcher. It is now possible to easily remove match IDs and matches don't go missing with large keyword lists / vocabularies. Fixes #4308. * Restore support for pickling * Fix internal keyword add/remove for numpy arrays * Add missing loop for match ID set in search loop * Remove cruft in matching loop for partial matches There was a bit of unnecessary code left over from FlashText in the matching loop to handle partial token matches, which we don't have with PhraseMatcher. * Replace dict trie with MapStruct trie * Fix how match ID hash is stored/added * Update fix for match ID vocab * Switch from map_get_unless_missing to map_get * Switch from numpy array to Token.get_struct_attr Access token attributes directly in Doc instead of making a copy of the relevant values in a numpy array. Add unsatisfactory warning for hash collision with reserved terminal hash key. (Ideally it would change the reserved terminal hash and redo the whole trie, but for now, I'm hoping there won't be collisions.) * Restructure imports to export find_matches * Implement full remove() Remove unnecessary trie paths and free unused maps. Parallel to Matcher, raise KeyError when attempting to remove a match ID that has not been added. * Store docs internally only as attr lists * Reduces size for pickle * Remove duplicate keywords store Now that docs are stored as lists of attr hashes, there's no need to have the duplicate _keywords store.
2019-09-27 17:22:34 +03:00
# intermediate phrase
pattern = Doc(en_vocab, words=["Google", "Now"])
matcher = PhraseMatcher(en_vocab)
matcher.add("COMPANY", [pattern])
Replace PhraseMatcher with trie-based search (#4309) * Replace PhraseMatcher with Aho-Corasick Replace PhraseMatcher with the Aho-Corasick algorithm over numpy arrays of the hash values for the relevant attribute. The implementation is based on FlashText. The speed should be similar to the previous PhraseMatcher. It is now possible to easily remove match IDs and matches don't go missing with large keyword lists / vocabularies. Fixes #4308. * Restore support for pickling * Fix internal keyword add/remove for numpy arrays * Add missing loop for match ID set in search loop * Remove cruft in matching loop for partial matches There was a bit of unnecessary code left over from FlashText in the matching loop to handle partial token matches, which we don't have with PhraseMatcher. * Replace dict trie with MapStruct trie * Fix how match ID hash is stored/added * Update fix for match ID vocab * Switch from map_get_unless_missing to map_get * Switch from numpy array to Token.get_struct_attr Access token attributes directly in Doc instead of making a copy of the relevant values in a numpy array. Add unsatisfactory warning for hash collision with reserved terminal hash key. (Ideally it would change the reserved terminal hash and redo the whole trie, but for now, I'm hoping there won't be collisions.) * Restructure imports to export find_matches * Implement full remove() Remove unnecessary trie paths and free unused maps. Parallel to Matcher, raise KeyError when attempting to remove a match ID that has not been added. * Store docs internally only as attr lists * Reduces size for pickle * Remove duplicate keywords store Now that docs are stored as lists of attr hashes, there's no need to have the duplicate _keywords store.
2019-09-27 17:22:34 +03:00
assert len(matcher(doc)) == 1
# initial token
pattern = Doc(en_vocab, words=["I"])
matcher = PhraseMatcher(en_vocab)
matcher.add("I", [pattern])
Replace PhraseMatcher with trie-based search (#4309) * Replace PhraseMatcher with Aho-Corasick Replace PhraseMatcher with the Aho-Corasick algorithm over numpy arrays of the hash values for the relevant attribute. The implementation is based on FlashText. The speed should be similar to the previous PhraseMatcher. It is now possible to easily remove match IDs and matches don't go missing with large keyword lists / vocabularies. Fixes #4308. * Restore support for pickling * Fix internal keyword add/remove for numpy arrays * Add missing loop for match ID set in search loop * Remove cruft in matching loop for partial matches There was a bit of unnecessary code left over from FlashText in the matching loop to handle partial token matches, which we don't have with PhraseMatcher. * Replace dict trie with MapStruct trie * Fix how match ID hash is stored/added * Update fix for match ID vocab * Switch from map_get_unless_missing to map_get * Switch from numpy array to Token.get_struct_attr Access token attributes directly in Doc instead of making a copy of the relevant values in a numpy array. Add unsatisfactory warning for hash collision with reserved terminal hash key. (Ideally it would change the reserved terminal hash and redo the whole trie, but for now, I'm hoping there won't be collisions.) * Restructure imports to export find_matches * Implement full remove() Remove unnecessary trie paths and free unused maps. Parallel to Matcher, raise KeyError when attempting to remove a match ID that has not been added. * Store docs internally only as attr lists * Reduces size for pickle * Remove duplicate keywords store Now that docs are stored as lists of attr hashes, there's no need to have the duplicate _keywords store.
2019-09-27 17:22:34 +03:00
assert len(matcher(doc)) == 1
# initial phrase
pattern = Doc(en_vocab, words=["I", "like"])
matcher = PhraseMatcher(en_vocab)
matcher.add("ILIKE", [pattern])
Replace PhraseMatcher with trie-based search (#4309) * Replace PhraseMatcher with Aho-Corasick Replace PhraseMatcher with the Aho-Corasick algorithm over numpy arrays of the hash values for the relevant attribute. The implementation is based on FlashText. The speed should be similar to the previous PhraseMatcher. It is now possible to easily remove match IDs and matches don't go missing with large keyword lists / vocabularies. Fixes #4308. * Restore support for pickling * Fix internal keyword add/remove for numpy arrays * Add missing loop for match ID set in search loop * Remove cruft in matching loop for partial matches There was a bit of unnecessary code left over from FlashText in the matching loop to handle partial token matches, which we don't have with PhraseMatcher. * Replace dict trie with MapStruct trie * Fix how match ID hash is stored/added * Update fix for match ID vocab * Switch from map_get_unless_missing to map_get * Switch from numpy array to Token.get_struct_attr Access token attributes directly in Doc instead of making a copy of the relevant values in a numpy array. Add unsatisfactory warning for hash collision with reserved terminal hash key. (Ideally it would change the reserved terminal hash and redo the whole trie, but for now, I'm hoping there won't be collisions.) * Restructure imports to export find_matches * Implement full remove() Remove unnecessary trie paths and free unused maps. Parallel to Matcher, raise KeyError when attempting to remove a match ID that has not been added. * Store docs internally only as attr lists * Reduces size for pickle * Remove duplicate keywords store Now that docs are stored as lists of attr hashes, there's no need to have the duplicate _keywords store.
2019-09-27 17:22:34 +03:00
assert len(matcher(doc)) == 1
# final token
pattern = Doc(en_vocab, words=["best"])
matcher = PhraseMatcher(en_vocab)
matcher.add("BEST", [pattern])
Replace PhraseMatcher with trie-based search (#4309) * Replace PhraseMatcher with Aho-Corasick Replace PhraseMatcher with the Aho-Corasick algorithm over numpy arrays of the hash values for the relevant attribute. The implementation is based on FlashText. The speed should be similar to the previous PhraseMatcher. It is now possible to easily remove match IDs and matches don't go missing with large keyword lists / vocabularies. Fixes #4308. * Restore support for pickling * Fix internal keyword add/remove for numpy arrays * Add missing loop for match ID set in search loop * Remove cruft in matching loop for partial matches There was a bit of unnecessary code left over from FlashText in the matching loop to handle partial token matches, which we don't have with PhraseMatcher. * Replace dict trie with MapStruct trie * Fix how match ID hash is stored/added * Update fix for match ID vocab * Switch from map_get_unless_missing to map_get * Switch from numpy array to Token.get_struct_attr Access token attributes directly in Doc instead of making a copy of the relevant values in a numpy array. Add unsatisfactory warning for hash collision with reserved terminal hash key. (Ideally it would change the reserved terminal hash and redo the whole trie, but for now, I'm hoping there won't be collisions.) * Restructure imports to export find_matches * Implement full remove() Remove unnecessary trie paths and free unused maps. Parallel to Matcher, raise KeyError when attempting to remove a match ID that has not been added. * Store docs internally only as attr lists * Reduces size for pickle * Remove duplicate keywords store Now that docs are stored as lists of attr hashes, there's no need to have the duplicate _keywords store.
2019-09-27 17:22:34 +03:00
assert len(matcher(doc)) == 1
# final phrase
pattern = Doc(en_vocab, words=["Now", "best"])
matcher = PhraseMatcher(en_vocab)
matcher.add("NOWBEST", [pattern])
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * **total execution time down from > 300 seconds to under 60 seconds** 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-07-25 00:38:44 +03:00
assert len(matcher(doc)) == 1
def test_phrase_matcher_length(en_vocab):
matcher = PhraseMatcher(en_vocab)
assert len(matcher) == 0
matcher.add("TEST", [Doc(en_vocab, words=["test"])])
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * **total execution time down from > 300 seconds to under 60 seconds** 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-07-25 00:38:44 +03:00
assert len(matcher) == 1
matcher.add("TEST2", [Doc(en_vocab, words=["test2"])])
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * **total execution time down from > 300 seconds to under 60 seconds** 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-07-25 00:38:44 +03:00
assert len(matcher) == 2
def test_phrase_matcher_contains(en_vocab):
matcher = PhraseMatcher(en_vocab)
matcher.add("TEST", [Doc(en_vocab, words=["test"])])
assert "TEST" in matcher
assert "TEST2" not in matcher
def test_phrase_matcher_add_new_api(en_vocab):
doc = Doc(en_vocab, words=["a", "b"])
patterns = [Doc(en_vocab, words=["a"]), Doc(en_vocab, words=["a", "b"])]
matcher = PhraseMatcher(en_vocab)
matcher.add("OLD_API", None, *patterns)
assert len(matcher(doc)) == 2
matcher = PhraseMatcher(en_vocab)
on_match = Mock()
matcher.add("OLD_API_CALLBACK", on_match, *patterns)
assert len(matcher(doc)) == 2
assert on_match.call_count == 2
# New API: add(key: str, patterns: List[List[dict]], on_match: Callable)
matcher = PhraseMatcher(en_vocab)
matcher.add("NEW_API", patterns)
assert len(matcher(doc)) == 2
matcher = PhraseMatcher(en_vocab)
on_match = Mock()
matcher.add("NEW_API_CALLBACK", patterns, on_match=on_match)
assert len(matcher(doc)) == 2
assert on_match.call_count == 2
Replace PhraseMatcher with trie-based search (#4309) * Replace PhraseMatcher with Aho-Corasick Replace PhraseMatcher with the Aho-Corasick algorithm over numpy arrays of the hash values for the relevant attribute. The implementation is based on FlashText. The speed should be similar to the previous PhraseMatcher. It is now possible to easily remove match IDs and matches don't go missing with large keyword lists / vocabularies. Fixes #4308. * Restore support for pickling * Fix internal keyword add/remove for numpy arrays * Add missing loop for match ID set in search loop * Remove cruft in matching loop for partial matches There was a bit of unnecessary code left over from FlashText in the matching loop to handle partial token matches, which we don't have with PhraseMatcher. * Replace dict trie with MapStruct trie * Fix how match ID hash is stored/added * Update fix for match ID vocab * Switch from map_get_unless_missing to map_get * Switch from numpy array to Token.get_struct_attr Access token attributes directly in Doc instead of making a copy of the relevant values in a numpy array. Add unsatisfactory warning for hash collision with reserved terminal hash key. (Ideally it would change the reserved terminal hash and redo the whole trie, but for now, I'm hoping there won't be collisions.) * Restructure imports to export find_matches * Implement full remove() Remove unnecessary trie paths and free unused maps. Parallel to Matcher, raise KeyError when attempting to remove a match ID that has not been added. * Store docs internally only as attr lists * Reduces size for pickle * Remove duplicate keywords store Now that docs are stored as lists of attr hashes, there's no need to have the duplicate _keywords store.
2019-09-27 17:22:34 +03:00
def test_phrase_matcher_repeated_add(en_vocab):
matcher = PhraseMatcher(en_vocab)
# match ID only gets added once
matcher.add("TEST", [Doc(en_vocab, words=["like"])])
matcher.add("TEST", [Doc(en_vocab, words=["like"])])
matcher.add("TEST", [Doc(en_vocab, words=["like"])])
matcher.add("TEST", [Doc(en_vocab, words=["like"])])
Replace PhraseMatcher with trie-based search (#4309) * Replace PhraseMatcher with Aho-Corasick Replace PhraseMatcher with the Aho-Corasick algorithm over numpy arrays of the hash values for the relevant attribute. The implementation is based on FlashText. The speed should be similar to the previous PhraseMatcher. It is now possible to easily remove match IDs and matches don't go missing with large keyword lists / vocabularies. Fixes #4308. * Restore support for pickling * Fix internal keyword add/remove for numpy arrays * Add missing loop for match ID set in search loop * Remove cruft in matching loop for partial matches There was a bit of unnecessary code left over from FlashText in the matching loop to handle partial token matches, which we don't have with PhraseMatcher. * Replace dict trie with MapStruct trie * Fix how match ID hash is stored/added * Update fix for match ID vocab * Switch from map_get_unless_missing to map_get * Switch from numpy array to Token.get_struct_attr Access token attributes directly in Doc instead of making a copy of the relevant values in a numpy array. Add unsatisfactory warning for hash collision with reserved terminal hash key. (Ideally it would change the reserved terminal hash and redo the whole trie, but for now, I'm hoping there won't be collisions.) * Restructure imports to export find_matches * Implement full remove() Remove unnecessary trie paths and free unused maps. Parallel to Matcher, raise KeyError when attempting to remove a match ID that has not been added. * Store docs internally only as attr lists * Reduces size for pickle * Remove duplicate keywords store Now that docs are stored as lists of attr hashes, there's no need to have the duplicate _keywords store.
2019-09-27 17:22:34 +03:00
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
assert "TEST" in matcher
assert "TEST2" not in matcher
assert len(matcher(doc)) == 1
def test_phrase_matcher_remove(en_vocab):
matcher = PhraseMatcher(en_vocab)
matcher.add("TEST1", [Doc(en_vocab, words=["like"])])
matcher.add("TEST2", [Doc(en_vocab, words=["best"])])
Replace PhraseMatcher with trie-based search (#4309) * Replace PhraseMatcher with Aho-Corasick Replace PhraseMatcher with the Aho-Corasick algorithm over numpy arrays of the hash values for the relevant attribute. The implementation is based on FlashText. The speed should be similar to the previous PhraseMatcher. It is now possible to easily remove match IDs and matches don't go missing with large keyword lists / vocabularies. Fixes #4308. * Restore support for pickling * Fix internal keyword add/remove for numpy arrays * Add missing loop for match ID set in search loop * Remove cruft in matching loop for partial matches There was a bit of unnecessary code left over from FlashText in the matching loop to handle partial token matches, which we don't have with PhraseMatcher. * Replace dict trie with MapStruct trie * Fix how match ID hash is stored/added * Update fix for match ID vocab * Switch from map_get_unless_missing to map_get * Switch from numpy array to Token.get_struct_attr Access token attributes directly in Doc instead of making a copy of the relevant values in a numpy array. Add unsatisfactory warning for hash collision with reserved terminal hash key. (Ideally it would change the reserved terminal hash and redo the whole trie, but for now, I'm hoping there won't be collisions.) * Restructure imports to export find_matches * Implement full remove() Remove unnecessary trie paths and free unused maps. Parallel to Matcher, raise KeyError when attempting to remove a match ID that has not been added. * Store docs internally only as attr lists * Reduces size for pickle * Remove duplicate keywords store Now that docs are stored as lists of attr hashes, there's no need to have the duplicate _keywords store.
2019-09-27 17:22:34 +03:00
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
assert "TEST1" in matcher
assert "TEST2" in matcher
assert "TEST3" not in matcher
assert len(matcher(doc)) == 2
matcher.remove("TEST1")
assert "TEST1" not in matcher
assert "TEST2" in matcher
assert "TEST3" not in matcher
assert len(matcher(doc)) == 1
matcher.remove("TEST2")
assert "TEST1" not in matcher
assert "TEST2" not in matcher
assert "TEST3" not in matcher
assert len(matcher(doc)) == 0
with pytest.raises(KeyError):
matcher.remove("TEST3")
assert "TEST1" not in matcher
assert "TEST2" not in matcher
assert "TEST3" not in matcher
assert len(matcher(doc)) == 0
def test_phrase_matcher_overlapping_with_remove(en_vocab):
matcher = PhraseMatcher(en_vocab)
matcher.add("TEST", [Doc(en_vocab, words=["like"])])
Replace PhraseMatcher with trie-based search (#4309) * Replace PhraseMatcher with Aho-Corasick Replace PhraseMatcher with the Aho-Corasick algorithm over numpy arrays of the hash values for the relevant attribute. The implementation is based on FlashText. The speed should be similar to the previous PhraseMatcher. It is now possible to easily remove match IDs and matches don't go missing with large keyword lists / vocabularies. Fixes #4308. * Restore support for pickling * Fix internal keyword add/remove for numpy arrays * Add missing loop for match ID set in search loop * Remove cruft in matching loop for partial matches There was a bit of unnecessary code left over from FlashText in the matching loop to handle partial token matches, which we don't have with PhraseMatcher. * Replace dict trie with MapStruct trie * Fix how match ID hash is stored/added * Update fix for match ID vocab * Switch from map_get_unless_missing to map_get * Switch from numpy array to Token.get_struct_attr Access token attributes directly in Doc instead of making a copy of the relevant values in a numpy array. Add unsatisfactory warning for hash collision with reserved terminal hash key. (Ideally it would change the reserved terminal hash and redo the whole trie, but for now, I'm hoping there won't be collisions.) * Restructure imports to export find_matches * Implement full remove() Remove unnecessary trie paths and free unused maps. Parallel to Matcher, raise KeyError when attempting to remove a match ID that has not been added. * Store docs internally only as attr lists * Reduces size for pickle * Remove duplicate keywords store Now that docs are stored as lists of attr hashes, there's no need to have the duplicate _keywords store.
2019-09-27 17:22:34 +03:00
# TEST2 is added alongside TEST
matcher.add("TEST2", [Doc(en_vocab, words=["like"])])
Replace PhraseMatcher with trie-based search (#4309) * Replace PhraseMatcher with Aho-Corasick Replace PhraseMatcher with the Aho-Corasick algorithm over numpy arrays of the hash values for the relevant attribute. The implementation is based on FlashText. The speed should be similar to the previous PhraseMatcher. It is now possible to easily remove match IDs and matches don't go missing with large keyword lists / vocabularies. Fixes #4308. * Restore support for pickling * Fix internal keyword add/remove for numpy arrays * Add missing loop for match ID set in search loop * Remove cruft in matching loop for partial matches There was a bit of unnecessary code left over from FlashText in the matching loop to handle partial token matches, which we don't have with PhraseMatcher. * Replace dict trie with MapStruct trie * Fix how match ID hash is stored/added * Update fix for match ID vocab * Switch from map_get_unless_missing to map_get * Switch from numpy array to Token.get_struct_attr Access token attributes directly in Doc instead of making a copy of the relevant values in a numpy array. Add unsatisfactory warning for hash collision with reserved terminal hash key. (Ideally it would change the reserved terminal hash and redo the whole trie, but for now, I'm hoping there won't be collisions.) * Restructure imports to export find_matches * Implement full remove() Remove unnecessary trie paths and free unused maps. Parallel to Matcher, raise KeyError when attempting to remove a match ID that has not been added. * Store docs internally only as attr lists * Reduces size for pickle * Remove duplicate keywords store Now that docs are stored as lists of attr hashes, there's no need to have the duplicate _keywords store.
2019-09-27 17:22:34 +03:00
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
assert "TEST" in matcher
assert len(matcher) == 2
assert len(matcher(doc)) == 2
# removing TEST does not remove the entry for TEST2
matcher.remove("TEST")
assert "TEST" not in matcher
assert len(matcher) == 1
assert len(matcher(doc)) == 1
assert matcher(doc)[0][0] == en_vocab.strings["TEST2"]
# removing TEST2 removes all
matcher.remove("TEST2")
assert "TEST2" not in matcher
assert len(matcher) == 0
assert len(matcher(doc)) == 0
def test_phrase_matcher_string_attrs(en_vocab):
words1 = ["I", "like", "cats"]
pos1 = ["PRON", "VERB", "NOUN"]
words2 = ["Yes", ",", "you", "hate", "dogs", "very", "much"]
pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"]
2020-09-21 21:43:54 +03:00
pattern = Doc(en_vocab, words=words1, pos=pos1)
matcher = PhraseMatcher(en_vocab, attr="POS")
matcher.add("TEST", [pattern])
2020-09-21 21:43:54 +03:00
doc = Doc(en_vocab, words=words2, pos=pos2)
matches = matcher(doc)
assert len(matches) == 1
match_id, start, end = matches[0]
assert match_id == en_vocab.strings["TEST"]
assert start == 2
assert end == 5
def test_phrase_matcher_string_attrs_negative(en_vocab):
"""Test that token with the control codes as ORTH are *not* matched."""
words1 = ["I", "like", "cats"]
pos1 = ["PRON", "VERB", "NOUN"]
words2 = ["matcher:POS-PRON", "matcher:POS-VERB", "matcher:POS-NOUN"]
pos2 = ["X", "X", "X"]
2020-09-21 21:43:54 +03:00
pattern = Doc(en_vocab, words=words1, pos=pos1)
matcher = PhraseMatcher(en_vocab, attr="POS")
matcher.add("TEST", [pattern])
2020-09-21 21:43:54 +03:00
doc = Doc(en_vocab, words=words2, pos=pos2)
matches = matcher(doc)
assert len(matches) == 0
def test_phrase_matcher_bool_attrs(en_vocab):
words1 = ["Hello", "world", "!"]
words2 = ["No", "problem", ",", "he", "said", "."]
pattern = Doc(en_vocab, words=words1)
matcher = PhraseMatcher(en_vocab, attr="IS_PUNCT")
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=words2)
matches = matcher(doc)
assert len(matches) == 2
match_id1, start1, end1 = matches[0]
match_id2, start2, end2 = matches[1]
assert match_id1 == en_vocab.strings["TEST"]
assert match_id2 == en_vocab.strings["TEST"]
assert start1 == 0
assert end1 == 3
assert start2 == 3
assert end2 == 6
def test_phrase_matcher_validation(en_vocab):
doc1 = Doc(en_vocab, words=["Test"])
doc1[0].dep_ = "ROOT"
doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X"
doc2[0].set_morph("Feat=Val")
doc3 = Doc(en_vocab, words=["Test"])
matcher = PhraseMatcher(en_vocab, validate=True)
with pytest.warns(UserWarning):
matcher.add("TEST1", [doc1])
with pytest.warns(UserWarning):
matcher.add("TEST2", [doc2])
with warnings.catch_warnings():
warnings.simplefilter("error")
matcher.add("TEST3", [doc3])
matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
with warnings.catch_warnings():
warnings.simplefilter("error")
matcher.add("TEST4", [doc2])
2019-08-21 15:00:37 +03:00
def test_attr_validation(en_vocab):
with pytest.raises(ValueError):
PhraseMatcher(en_vocab, attr="UNSUPPORTED")
def test_attr_pipeline_checks(en_vocab):
doc1 = Doc(en_vocab, words=["Test"])
doc1[0].dep_ = "ROOT"
2019-08-21 15:00:37 +03:00
doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X"
doc2[0].set_morph("Feat=Val")
doc2[0].lemma_ = "LEMMA"
2019-08-21 15:00:37 +03:00
doc3 = Doc(en_vocab, words=["Test"])
# DEP requires DEP
2019-08-21 15:00:37 +03:00
matcher = PhraseMatcher(en_vocab, attr="DEP")
matcher.add("TEST1", [doc1])
2019-08-21 15:00:37 +03:00
with pytest.raises(ValueError):
matcher.add("TEST2", [doc2])
2019-08-21 15:00:37 +03:00
with pytest.raises(ValueError):
matcher.add("TEST3", [doc3])
# TAG, POS, LEMMA require those values
2019-08-21 15:00:37 +03:00
for attr in ("TAG", "POS", "LEMMA"):
matcher = PhraseMatcher(en_vocab, attr=attr)
matcher.add("TEST2", [doc2])
2019-08-21 15:00:37 +03:00
with pytest.raises(ValueError):
matcher.add("TEST1", [doc1])
2019-08-21 15:00:37 +03:00
with pytest.raises(ValueError):
matcher.add("TEST3", [doc3])
2019-08-21 15:00:37 +03:00
# TEXT/ORTH only require tokens
matcher = PhraseMatcher(en_vocab, attr="ORTH")
matcher.add("TEST3", [doc3])
2019-08-21 15:00:37 +03:00
matcher = PhraseMatcher(en_vocab, attr="TEXT")
matcher.add("TEST3", [doc3])
def test_phrase_matcher_callback(en_vocab):
mock = Mock()
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
pattern = Doc(en_vocab, words=["Google", "Now"])
matcher = PhraseMatcher(en_vocab)
matcher.add("COMPANY", [pattern], on_match=mock)
matches = matcher(doc)
mock.assert_called_once_with(matcher, doc, 0, matches)
def test_phrase_matcher_remove_overlapping_patterns(en_vocab):
matcher = PhraseMatcher(en_vocab)
pattern1 = Doc(en_vocab, words=["this"])
pattern2 = Doc(en_vocab, words=["this", "is"])
pattern3 = Doc(en_vocab, words=["this", "is", "a"])
pattern4 = Doc(en_vocab, words=["this", "is", "a", "word"])
matcher.add("THIS", [pattern1, pattern2, pattern3, pattern4])
matcher.remove("THIS")
def test_phrase_matcher_basic_check(en_vocab):
matcher = PhraseMatcher(en_vocab)
# Potential mistake: pass in pattern instead of list of patterns
pattern = Doc(en_vocab, words=["hello", "world"])
with pytest.raises(ValueError):
matcher.add("TEST", pattern)
def test_phrase_matcher_pickle(en_vocab):
matcher = PhraseMatcher(en_vocab)
mock = Mock()
matcher.add("TEST", [Doc(en_vocab, words=["test"])])
matcher.add("TEST2", [Doc(en_vocab, words=["test2"])], on_match=mock)
doc = Doc(en_vocab, words=["these", "are", "tests", ":", "test", "test2"])
assert len(matcher) == 2
b = srsly.pickle_dumps(matcher)
matcher_unpickled = srsly.pickle_loads(b)
# call after pickling to avoid recursion error related to mock
matches = matcher(doc)
matches_unpickled = matcher_unpickled(doc)
assert len(matcher) == len(matcher_unpickled)
assert matches == matches_unpickled
# clunky way to vaguely check that callback is unpickled
(vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
assert isinstance(callbacks.get("TEST2"), Mock)
2020-08-31 15:53:22 +03:00
def test_phrase_matcher_as_spans(en_vocab):
"""Test the new as_spans=True API."""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["hello", "world"])])
matcher.add("B", [Doc(en_vocab, words=["test"])])
doc = Doc(en_vocab, words=["...", "hello", "world", "this", "is", "a", "test"])
matches = matcher(doc, as_spans=True)
assert len(matches) == 2
assert isinstance(matches[0], Span)
assert matches[0].text == "hello world"
assert matches[0].label_ == "A"
assert isinstance(matches[1], Span)
assert matches[1].text == "test"
assert matches[1].label_ == "B"
2020-08-31 18:01:24 +03:00
def test_phrase_matcher_deprecated(en_vocab):
matcher = PhraseMatcher(en_vocab)
matcher.add("TEST", [Doc(en_vocab, words=["helllo"])])
doc = Doc(en_vocab, words=["hello", "world"])
with pytest.warns(DeprecationWarning) as record:
for _ in matcher.pipe([doc]):
pass
assert record.list
assert "spaCy v3.0" in str(record.list[0].message)
@pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"])
def test_phrase_matcher_sent_start(en_vocab, attr):
2021-01-30 04:52:33 +03:00
_ = PhraseMatcher(en_vocab, attr=attr) # noqa: F841
def test_span_in_phrasematcher(en_vocab):
"""Ensure that PhraseMatcher accepts Span and Doc as input"""
2021-02-13 04:55:56 +03:00
# fmt: off
words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
# fmt: on
doc = Doc(en_vocab, words=words)
span = doc[:8]
pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
matcher = PhraseMatcher(en_vocab)
matcher.add("SPACY", [pattern])
matches_doc = matcher(doc)
matches_span = matcher(span)
assert len(matches_doc) == 1
assert len(matches_span) == 1
def test_span_v_doc_in_phrasematcher(en_vocab):
"""Ensure that PhraseMatcher only returns matches in input Span and not in entire Doc"""
2021-02-13 04:55:56 +03:00
# fmt: off
words = [
"I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "Spans",
"and", "Docs", "in", "my", "matchers", "," "and", "Spans", "and", "Docs",
"everywhere", "."
]
# fmt: on
doc = Doc(en_vocab, words=words)
span = doc[9:15] # second clause
pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
matcher = PhraseMatcher(en_vocab)
matcher.add("SPACY", [pattern])
matches_doc = matcher(doc)
matches_span = matcher(span)
assert len(matches_doc) == 3
assert len(matches_span) == 1