Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-05-22 10:41:56 -05:00
commit 0264447c4d
21 changed files with 128 additions and 237 deletions

View File

@@ -372,7 +372,7 @@ cdef class Matcher:
                 ent_id = state.second.attrs[0].value
                 label = state.second.attrs[0].value
                 matches.append((ent_id, start, end))
-        for i, (ent_id, label, start, end) in enumerate(matches):
+        for i, (ent_id, start, end) in enumerate(matches):
             on_match = self._callbacks.get(ent_id)
             if on_match is not None:
                 on_match(self, doc, i, matches)
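
Note on usage: after this change, matches returned by the Matcher are plain (match_id, start, end) triples and on_match callbacks receive the index of the current match. A minimal sketch of the new callback flow (assumes an installed 'en' model; the names here are illustrative, not part of this commit):

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.load('en')
    matcher = Matcher(nlp.vocab)

    def on_match(matcher, doc, i, matches):
        # matches[i] is a (match_id, start, end) triple -- the label slot is gone
        match_id, start, end = matches[i]
        print('Matched:', doc[start:end].text)

    matcher.add('HelloWorld', on_match, [{'LOWER': 'hello'}, {'LOWER': 'world'}])
    doc = nlp(u'hello world')
    matches = matcher(doc)  # [(match_id, start, end)]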

View File

@@ -7,9 +7,12 @@ from libc.string cimport memcpy
 from libc.stdint cimport uint64_t, uint32_t
 from murmurhash.mrmr cimport hash64, hash32
 from preshed.maps cimport map_iter, key_t
+from libc.stdint cimport uint32_t
+import ujson
+import dill

 from .typedefs cimport hash_t
-from libc.stdint cimport uint32_t
+from . import util


 cpdef hash_t hash_string(unicode string) except 0:
@@ -92,14 +95,6 @@ cdef class StringStore:
         def __get__(self):
             return self.size -1

-    def __reduce__(self):
-        # TODO: OOV words, for the is_frozen stuff?
-        if self.is_frozen:
-            raise NotImplementedError(
-                "Currently missing support for pickling StringStore when "
-                "is_frozen=True")
-        return (StringStore, (list(self),))
-
     def __len__(self):
         """The number of strings in the store.
@@ -186,7 +181,10 @@ cdef class StringStore:
        path (unicode or Path): A path to a directory, which will be created if
            it doesn't exist. Paths may be either strings or `Path`-like objects.
        """
-        raise NotImplementedError()
+        path = util.ensure_path(path)
+        strings = list(self)
+        with path.open('w') as file_:
+            ujson.dump(strings, file_)

     def from_disk(self, path):
         """Loads state from a directory. Modifies the object in place and
@@ -196,7 +194,11 @@ cdef class StringStore:
            strings or `Path`-like objects.
        RETURNS (StringStore): The modified `StringStore` object.
        """
-        raise NotImplementedError()
+        path = util.ensure_path(path)
+        with path.open('r') as file_:
+            strings = ujson.load(file_)
+        self._reset_and_load(strings)
+        return self

     def to_bytes(self, **exclude):
         """Serialize the current state to a binary string.
@@ -204,7 +206,7 @@ cdef class StringStore:
        **exclude: Named attributes to prevent from being serialized.
        RETURNS (bytes): The serialized form of the `StringStore` object.
        """
-        raise NotImplementedError()
+        return ujson.dumps(list(self))

     def from_bytes(self, bytes_data, **exclude):
         """Load state from a binary string.
@@ -213,7 +215,9 @@ cdef class StringStore:
        **exclude: Named attributes to prevent from being loaded.
        RETURNS (StringStore): The `StringStore` object.
        """
-        raise NotImplementedError()
+        strings = ujson.loads(bytes_data)
+        self._reset_and_load(strings)
+        return self

     def set_frozen(self, bint is_frozen):
         # TODO
@@ -222,6 +226,17 @@ cdef class StringStore:
     def flush_oov(self):
         self._oov = PreshMap()

+    def _reset_and_load(self, strings, freeze=False):
+        self.mem = Pool()
+        self._map = PreshMap()
+        self._oov = PreshMap()
+        self._resize_at = 10000
+        self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
+        self.size = 1
+        for string in strings:
+            _ = self[string]
+        self.is_frozen = freeze
+
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode('utf8')
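
Note on usage: a short round-trip sketch of the serialization methods implemented above (illustrative only; assumes StringStore is importable from spacy.strings and ujson is installed):

    from spacy.strings import StringStore

    stringstore = StringStore([u'apple', u'orange'])
    data = stringstore.to_bytes()               # JSON-encoded list of strings
    restored = StringStore().from_bytes(data)   # rebuilds the store via _reset_and_load()
    assert restored[u'apple'] == stringstore[u'apple']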

View File

@@ -1,53 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from ...matcher import Matcher
-from ...attrs import ORTH
-from ..util import get_doc
-
-import pytest
-
-
-@pytest.mark.parametrize('words,entity', [
-    (["Test", "Entity"], "TestEntity")])
-def test_matcher_add_empty_entity(en_vocab, words, entity):
-    matcher = Matcher(en_vocab)
-    matcher.add_entity(entity)
-    doc = get_doc(en_vocab, words)
-    assert matcher.n_patterns == 0
-    assert matcher(doc) == []
-
-
-@pytest.mark.parametrize('entity1,entity2,attrs', [
-    ("TestEntity", "TestEntity2", {"Hello": "World"})])
-def test_matcher_get_entity_attrs(en_vocab, entity1, entity2, attrs):
-    matcher = Matcher(en_vocab)
-    matcher.add_entity(entity1)
-    assert matcher.get_entity(entity1) == {}
-    matcher.add_entity(entity2, attrs=attrs)
-    assert matcher.get_entity(entity2) == attrs
-    assert matcher.get_entity(entity1) == {}
-
-
-@pytest.mark.parametrize('words,entity,attrs',
-    [(["Test", "Entity"], "TestEntity", {"Hello": "World"})])
-def test_matcher_get_entity_via_match(en_vocab, words, entity, attrs):
-    matcher = Matcher(en_vocab)
-    matcher.add_entity(entity, attrs=attrs)
-    doc = get_doc(en_vocab, words)
-    assert matcher.n_patterns == 0
-    assert matcher(doc) == []
-
-    matcher.add_pattern(entity, [{ORTH: words[0]}, {ORTH: words[1]}])
-    assert matcher.n_patterns == 1
-
-    matches = matcher(doc)
-    assert len(matches) == 1
-    assert len(matches[0]) == 4
-
-    ent_id, label, start, end = matches[0]
-    assert ent_id == matcher.vocab.strings[entity]
-    assert label == 0
-    assert start == 0
-    assert end == 2
-
-    assert matcher.get_entity(ent_id) == attrs

View File

@@ -21,7 +21,6 @@ def test_simple_types(EN):
 def test_consistency_bug(EN):
     '''Test an arbitrary sequence-consistency bug encountered during speed test'''
     tokens = EN(u'Where rap essentially went mainstream, illustrated by seminal Public Enemy, Beastie Boys and L.L. Cool J. tracks.')
-
     tokens = EN(u'''Charity and other short-term aid have buoyed them so far, and a tax-relief bill working its way through Congress would help. But the September 11 Victim Compensation Fund, enacted by Congress to discourage people from filing lawsuits, will determine the shape of their lives for years to come.\n\n''', entity=False)
     tokens.ents += tuple(EN.matcher(tokens))
     EN.entity(tokens)
@@ -30,17 +29,8 @@ def test_consistency_bug(EN):
 @pytest.mark.models
 def test_unit_end_gazetteer(EN):
     '''Test a bug in the interaction between the NER model and the gazetteer'''
-    matcher = Matcher(EN.vocab,
-                      {'MemberNames':
-                       ('PERSON', {},
-                        [
-                            [{LOWER: 'cal'}],
-                            [{LOWER: 'cal'}, {LOWER: 'henderson'}],
-                        ]
-                       )
-                      }
-                     )
+    matcher = Matcher(EN.vocab)
+    matcher.add('MemberNames', None, [{LOWER: 'cal'}], [{LOWER: 'cal'}, {LOWER: 'henderson'}])
     doc = EN(u'who is cal the manager of?')
     if len(list(doc.ents)) == 0:
         ents = matcher(doc)
@@ -50,4 +40,4 @@ def test_unit_end_gazetteer(EN):
     assert list(doc.ents)[0].text == 'cal'

View File

@@ -2,15 +2,14 @@
 from __future__ import unicode_literals

 from ...matcher import Matcher
-from ...attrs import ORTH, LOWER

 import pytest

-pattern1 = [[{LOWER: 'celtics'}], [{LOWER: 'boston'}, {LOWER: 'celtics'}]]
-pattern2 = [[{LOWER: 'boston'}, {LOWER: 'celtics'}], [{LOWER: 'celtics'}]]
-pattern3 = [[{LOWER: 'boston'}], [{LOWER: 'boston'}, {LOWER: 'celtics'}]]
-pattern4 = [[{LOWER: 'boston'}, {LOWER: 'celtics'}], [{LOWER: 'boston'}]]
+pattern1 = [[{'LOWER': 'celtics'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]]
+pattern2 = [[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'celtics'}]]
+pattern3 = [[{'LOWER': 'boston'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]]
+pattern4 = [[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'boston'}]]


 @pytest.fixture
@@ -24,10 +23,11 @@ def doc(en_tokenizer):
 def test_issue118(doc, pattern):
     """Test a bug that arose from having overlapping matches"""
     ORG = doc.vocab.strings['ORG']
-    matcher = Matcher(doc.vocab, {'BostonCeltics': ('ORG', {}, pattern)})
+    matcher = Matcher(doc.vocab)
+    matcher.add("BostonCeltics", None, *pattern)

     assert len(list(doc.ents)) == 0
-    matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
+    matches = [(ORG, start, end) for _, start, end in matcher(doc)]
     assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
     doc.ents = matches[:1]
     ents = list(doc.ents)
@@ -41,10 +41,11 @@ def test_issue118(doc, pattern):
 def test_issue118_prefix_reorder(doc, pattern):
     """Test a bug that arose from having overlapping matches"""
     ORG = doc.vocab.strings['ORG']
-    matcher = Matcher(doc.vocab, {'BostonCeltics': ('ORG', {}, pattern)})
+    matcher = Matcher(doc.vocab)
+    matcher.add('BostonCeltics', None, *pattern)

     assert len(list(doc.ents)) == 0
-    matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
+    matches = [(ORG, start, end) for _, start, end in matcher(doc)]
     doc.ents += tuple(matches)[1:]
     assert matches == [(ORG, 9, 10), (ORG, 9, 11)]
     ents = doc.ents

View File

@@ -2,7 +2,6 @@
 from __future__ import unicode_literals

 from ...matcher import Matcher
-from ...attrs import LOWER

 import pytest
@@ -10,14 +9,14 @@ import pytest
 def test_issue242(en_tokenizer):
     """Test overlapping multi-word phrases."""
     text = "There are different food safety standards in different countries."
-    patterns = [[{LOWER: 'food'}, {LOWER: 'safety'}],
-                [{LOWER: 'safety'}, {LOWER: 'standards'}]]
+    patterns = [[{'LOWER': 'food'}, {'LOWER': 'safety'}],
+                [{'LOWER': 'safety'}, {'LOWER': 'standards'}]]
     doc = en_tokenizer(text)
     matcher = Matcher(doc.vocab)
-    matcher.add('FOOD', 'FOOD', {}, patterns)
+    matcher.add('FOOD', None, *patterns)

-    matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
+    matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)]
     doc.ents += tuple(matches)
     match1, match2 = matches
     assert match1[1] == 3

View File

@@ -1,7 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from ...attrs import ORTH
 from ...matcher import Matcher

 import pytest
@@ -12,13 +11,13 @@ def test_issue429(EN):
     def merge_phrases(matcher, doc, i, matches):
         if i != len(matches) - 1:
             return None
-        spans = [(ent_id, label, doc[start:end]) for ent_id, label, start, end in matches]
+        spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
         for ent_id, label, span in spans:
             span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label])

     doc = EN('a')
     matcher = Matcher(EN.vocab)
-    matcher.add('key', label='TEST', attrs={}, specs=[[{ORTH: 'a'}]], on_match=merge_phrases)
+    matcher.add('TEST', on_match=merge_phrases, [{'ORTH': 'a'}])
     doc = EN.tokenizer('a b c')
     EN.tagger(doc)
     matcher(doc)

View File

@@ -7,14 +7,16 @@ from ...attrs import IS_PUNCT, ORTH
 import pytest

-@pytest.mark.models
-def test_issue587(EN):
+def test_issue587(en_tokenizer):
     """Test that Matcher doesn't segfault on particular input"""
-    matcher = Matcher(EN.vocab)
-    content = '''a b; c'''
-    matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}]])
-    matcher(EN(content))
-    matcher.add(entity_key='2', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}]])
-    matcher(EN(content))
-    matcher.add(entity_key='3', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}]])
-    matcher(EN(content))
+    doc = en_tokenizer('a b; c')
+    matcher = Matcher(doc.vocab)
+    matcher.add('TEST1', None, [{ORTH: 'a'}, {ORTH: 'b'}])
+    matches = matcher(doc)
+    assert len(matches) == 1
+    matcher.add('TEST2', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}])
+    matches = matcher(doc)
+    assert len(matches) == 2
+    matcher.add('TEST3', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}])
+    matches = matcher(doc)
+    assert len(matches) == 2

View File

@@ -9,4 +9,4 @@ import pytest
 def test_issue588(en_vocab):
     matcher = Matcher(en_vocab)
     with pytest.raises(ValueError):
-        matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[]])
+        matcher.add('TEST', None, [])

View File

@@ -1,7 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from ...attrs import ORTH, IS_ALPHA, LIKE_NUM
 from ...matcher import Matcher
 from ..util import get_doc
@@ -9,14 +8,8 @@ from ..util import get_doc
 def test_issue590(en_vocab):
     """Test overlapping matches"""
     doc = get_doc(en_vocab, ['n', '=', '1', ';', 'a', ':', '5', '%'])
     matcher = Matcher(en_vocab)
-    matcher.add_entity("ab", acceptor=None, on_match=None)
-    matcher.add_pattern('ab', [{IS_ALPHA: True}, {ORTH: ':'},
-                               {LIKE_NUM: True}, {ORTH: '%'}],
-                        label='a')
-    matcher.add_pattern('ab', [{IS_ALPHA: True}, {ORTH: '='},
-                               {LIKE_NUM: True}],
-                        label='b')
+    matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': ':'}, {'LIKE_NUM': True}, {'ORTH': '%'}])
+    matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': '='}, {'LIKE_NUM': True}])
     matches = matcher(doc)
     assert len(matches) == 2

View File

@@ -1,21 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from ...attrs import ORTH
-from ...matcher import Matcher
-from ..util import get_doc
-
-
-def test_issue605(en_vocab):
-    def return_false(doc, ent_id, label, start, end):
-        return False
-
-    words = ["The", "golf", "club", "is", "broken"]
-    pattern = [{ORTH: "golf"}, {ORTH: "club"}]
-    label = "Sport_Equipment"
-    doc = get_doc(en_vocab, words)
-    matcher = Matcher(doc.vocab)
-    matcher.add_entity(label, acceptor=return_false)
-    matcher.add_pattern(label, pattern)
-    match = matcher(doc)
-    assert match == []

View File

@@ -2,7 +2,6 @@
 from __future__ import unicode_literals

 from ...matcher import Matcher
-from ...attrs import ORTH


 def test_issue615(en_tokenizer):
@@ -14,19 +13,17 @@ def test_issue615(en_tokenizer):
         if i != len(matches)-1:
             return None
         # Get Span objects
-        spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]
+        spans = [(ent_id, ent_id, doc[start : end]) for ent_id, start, end in matches]
         for ent_id, label, span in spans:
             span.merge('NNP' if label else span.root.tag_, span.text, doc.vocab.strings[label])

     text = "The golf club is broken"
-    pattern = [{ORTH: "golf"}, {ORTH: "club"}]
+    pattern = [{'ORTH': "golf"}, {'ORTH': "club"}]
     label = "Sport_Equipment"

     doc = en_tokenizer(text)
     matcher = Matcher(doc.vocab)
-    matcher.add_entity(label, on_match=merge_phrases)
-    matcher.add_pattern(label, pattern, label=label)
+    matcher.add(label, merge_phrases, pattern)
     match = matcher(doc)
     entities = list(doc.ents)

View File

@@ -1,16 +1,13 @@
 from __future__ import unicode_literals
-from ... import load as load_spacy
-from ...attrs import LEMMA
-from ...matcher import merge_phrase

 import pytest

-@pytest.mark.xfail
 @pytest.mark.models
-def test_issue758():
+def test_issue758(EN):
     '''Test parser transition bug after label added.'''
-    nlp = load_spacy('en')
-    nlp.matcher.add('splash', 'my_entity', {},
-                    [[{LEMMA: 'splash'}, {LEMMA: 'on'}]],
-                    on_match=merge_phrase)
+    from ...matcher import merge_phrase
+    nlp = EN()
+    nlp.matcher.add('splash', merge_phrase, [[{'LEMMA': 'splash'}, {'LEMMA': 'on'}]])
     doc = nlp('splash On', parse=False)

View File

@@ -1,8 +1,5 @@
-'''
-Test Matcher matches with '*' operator and Boolean flag
-'''
-from __future__ import unicode_literals
-from __future__ import print_function
+# coding: utf-8
+from __future__ import unicode_literals, print_function

 import pytest
 from ...matcher import Matcher
@@ -12,41 +9,30 @@ from ...tokens import Doc
 def test_basic_case():
+    """Test Matcher matches with '*' operator and Boolean flag"""
     matcher = Matcher(Vocab(
         lex_attr_getters={LOWER: lambda string: string.lower()}))
     IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
-    matcher.add_pattern(
-        "FarAway",
-        [
-            {LOWER: "bob"},
-            {'OP': '*', LOWER: 'and'},
-            {LOWER: 'frank'}
-        ])
+    matcher.add('FarAway', None, [{'LOWER': "bob"}, {'OP': '*', 'LOWER': 'and'}, {'LOWER': 'frank'}])
     doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
     match = matcher(doc)
     assert len(match) == 1
-    ent_id, label, start, end = match[0]
+    ent_id, start, end = match[0]
     assert start == 0
     assert end == 4


 @pytest.mark.xfail
 def test_issue850():
-    '''The problem here is that the variable-length pattern matches the
-    succeeding token. We then don't handle the ambiguity correctly.'''
+    """The problem here is that the variable-length pattern matches the
+    succeeding token. We then don't handle the ambiguity correctly."""
     matcher = Matcher(Vocab(
         lex_attr_getters={LOWER: lambda string: string.lower()}))
     IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
-    matcher.add_pattern(
-        "FarAway",
-        [
-            {LOWER: "bob"},
-            {'OP': '*', IS_ANY_TOKEN: True},
-            {LOWER: 'frank'}
-        ])
+    matcher.add('FarAway', None, [{'LOWER': "bob"}, {'OP': '*', 'IS_ANY_TOKEN': True}, {'LOWER': 'frank'}])
     doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
     match = matcher(doc)
     assert len(match) == 1
-    ent_id, label, start, end = match[0]
+    ent_id, start, end = match[0]
     assert start == 0
     assert end == 4

View File

@@ -69,10 +69,8 @@ def test_stringstore_massive_strings(stringstore):
 @pytest.mark.parametrize('text', ["qqqqq"])
-def test_stringstore_dump_load(stringstore, text_file, text):
+def test_stringstore_to_bytes(stringstore, text):
     store = stringstore[text]
-    stringstore.dump(text_file)
-    text_file.seek(0)
-    new_stringstore = StringStore()
-    new_stringstore.load(text_file)
+    serialized = stringstore.to_bytes()
+    new_stringstore = StringStore().from_bytes(serialized)
     assert new_stringstore[store] == text

View File

@@ -9,19 +9,22 @@ import pytest
 @pytest.fixture
 def matcher(en_vocab):
-    patterns = {
-        'JS': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]],
-        'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]],
-        'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]]
-    }
-    return Matcher(en_vocab, patterns)
+    rules = {
+        'JS': [[{'ORTH': 'JavaScript'}]],
+        'GoogleNow': [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]],
+        'Java': [[{'LOWER': 'java'}]]
+    }
+    matcher = Matcher(en_vocab)
+    for key, patterns in rules.items():
+        matcher.add(key, None, *patterns)
+    return matcher


 @pytest.mark.parametrize('words', [["Some", "words"]])
 def test_matcher_init(en_vocab, words):
     matcher = Matcher(en_vocab)
     doc = get_doc(en_vocab, words)
-    assert matcher.n_patterns == 0
+    assert len(matcher) == 0
     assert matcher(doc) == []
@@ -32,39 +35,35 @@ def test_matcher_no_match(matcher):
 def test_matcher_compile(matcher):
-    assert matcher.n_patterns == 3
+    assert len(matcher) == 3


 def test_matcher_match_start(matcher):
     words = ["JavaScript", "is", "good"]
     doc = get_doc(matcher.vocab, words)
-    assert matcher(doc) == [(matcher.vocab.strings['JS'],
-                             matcher.vocab.strings['PRODUCT'], 0, 1)]
+    assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)]


 def test_matcher_match_end(matcher):
     words = ["I", "like", "java"]
     doc = get_doc(matcher.vocab, words)
-    assert matcher(doc) == [(doc.vocab.strings['Java'],
-                             doc.vocab.strings['PRODUCT'], 2, 3)]
+    assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)]


 def test_matcher_match_middle(matcher):
     words = ["I", "like", "Google", "Now", "best"]
     doc = get_doc(matcher.vocab, words)
-    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'],
-                             doc.vocab.strings['PRODUCT'], 2, 4)]
+    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)]


 def test_matcher_match_multi(matcher):
     words = ["I", "like", "Google", "Now", "and", "java", "best"]
     doc = get_doc(matcher.vocab, words)
-    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'],
-                             doc.vocab.strings['PRODUCT'], 2, 4),
-                            (doc.vocab.strings['Java'],
-                             doc.vocab.strings['PRODUCT'], 5, 6)]
+    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4),
+                            (doc.vocab.strings['Java'], 5, 6)]


+@pytest.mark.xfail
 def test_matcher_phrase_matcher(en_vocab):
     words = ["Google", "Now"]
     doc = get_doc(en_vocab, words)
@@ -87,13 +86,13 @@ def test_matcher_match_zero(matcher):
                 {'IS_PUNCT': True},
                 {'ORTH': '"'}]

-    matcher.add('Quote', '', {}, [pattern1])
+    matcher.add('Quote', None, pattern1)
     doc = get_doc(matcher.vocab, words1)
     assert len(matcher(doc)) == 1

     doc = get_doc(matcher.vocab, words2)
     assert len(matcher(doc)) == 0

-    matcher.add('Quote', '', {}, [pattern2])
+    matcher.add('Quote', None, pattern2)
     assert len(matcher(doc)) == 0
@@ -102,24 +101,18 @@ def test_matcher_match_zero_plus(matcher):
     pattern = [{'ORTH': '"'},
                {'OP': '*', 'IS_PUNCT': False},
                {'ORTH': '"'}]
-    matcher.add('Quote', '', {}, [pattern])
+    matcher.add('Quote', None, pattern)
     doc = get_doc(matcher.vocab, words)
     assert len(matcher(doc)) == 1


 def test_matcher_match_one_plus(matcher):
     control = Matcher(matcher.vocab)
-    control.add_pattern('BasicPhilippe',
-                        [{'ORTH': 'Philippe'}], label=321)
+    control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])

     doc = get_doc(control.vocab, ['Philippe', 'Philippe'])
     m = control(doc)
     assert len(m) == 2
-    matcher.add_pattern('KleenePhilippe',
-                        [
-                            {'ORTH': 'Philippe', 'OP': '1'},
-                            {'ORTH': 'Philippe', 'OP': '+'}], label=321)
+    matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
+                                         {'ORTH': 'Philippe', 'OP': '+'}])
     m = matcher(doc)
     assert len(m) == 1
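
Note on usage: a brief sketch of the container behaviour these tests now exercise, namely len() and add() with an on_match slot (illustrative; uses a bare Vocab in place of the en_vocab fixture):

    from spacy.vocab import Vocab
    from spacy.matcher import Matcher

    matcher = Matcher(Vocab())
    assert len(matcher) == 0
    matcher.add('Product', None, [{'LOWER': 'javascript'}])  # key, on_match, *patterns
    assert len(matcher) == 1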

View File

@@ -20,9 +20,8 @@ p Create the rule-based #[code Matcher].
 +aside-code("Example").
     from spacy.matcher import Matcher
-    from spacy.attrs import LOWER

-    patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]}
+    patterns = {'HelloWorld': [{'LOWER': 'hello'}, {'LOWER': 'world'}]}
     matcher = Matcher(nlp.vocab)

 +table(["Name", "Type", "Description"])
@@ -50,10 +49,9 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
 +aside-code("Example").
     from spacy.matcher import Matcher
-    from spacy.attrs import LOWER

     matcher = Matcher(nlp.vocab)
-    pattern = [{LOWER: "hello"}, {LOWER: "world"}]
+    pattern = [{'LOWER': "hello"}, {'LOWER': "world"}]
     matcher.add("HelloWorld", on_match=None, pattern)
     doc = nlp(u'hello world!')
     matches = matcher(doc)
@@ -129,7 +127,7 @@ p
 +aside-code("Example").
     matcher = Matcher(nlp.vocab)
     assert len(matcher) == 0
-    matcher.add('Rule', None, [{ORTH: 'test'}])
+    matcher.add('Rule', None, [{'ORTH': 'test'}])
     assert len(matcher) == 1

 +table(["Name", "Type", "Description"])
@@ -146,7 +144,7 @@ p Check whether the matcher contains rules for a match ID.
 +aside-code("Example").
     matcher = Matcher(nlp.vocab)
     assert 'Rule' in matcher == False
-    matcher.add('Rule', None, [{ORTH: 'test'}])
+    matcher.add('Rule', None, [{'ORTH': 'test'}])
     assert 'Rule' in matcher == True

 +table(["Name", "Type", "Description"])
@@ -175,8 +173,8 @@ p
         print('Matched!', matches)

     matcher = Matcher(nlp.vocab)
-    matcher.add('HelloWorld', on_match, [{LOWER: "hello"}, {LOWER: "world"}])
-    matcher.add('GoogleMaps', on_match, [{ORTH: "Google"}, {ORTH: "Maps"}])
+    matcher.add('HelloWorld', on_match, [{'LOWER': 'hello'}, {'LOWER': 'world'}])
+    matcher.add('GoogleMaps', on_match, [{'ORTH': 'Google'}, {'ORTH': 'Maps'}])
     doc = nlp(u'HELLO WORLD on Google Maps.')
     matches = matcher(doc)
@@ -208,7 +206,7 @@ p
     | ID does not exist.

 +aside-code("Example").
-    matcher.add('Rule', None, [{ORTH: 'test'}])
+    matcher.add('Rule', None, [{'ORTH': 'test'}])
     assert 'Rule' in matcher == True
     matcher.remove('Rule')
     assert 'Rule' in matcher == False
@@ -228,7 +226,7 @@ p
     | patterns.

 +aside-code("Example").
-    pattern = [{ORTH: 'test'}]
+    pattern = [{'ORTH': 'test'}]
     matcher.add('Rule', None, pattern)
     (on_match, patterns) = matcher.get('Rule')
     assert patterns = [pattern]

View File

@@ -231,7 +231,7 @@ p
     data_path = Path('/some/path')
     if not path.exists():
         util.prints("Can't find the path.", data_path,
-                    title="Error", exits=True)
+                    title="Error", exits=1)

 +table(["Name", "Type", "Description"])
 +row
@@ -243,5 +243,6 @@ p
     +cell #[code **kwargs]
     +cell -
     +cell
-        | #[code title] is rendered as coloured headline. #[code exits=True]
-        | performs system exit after printing.
+        | #[code title] is rendered as coloured headline. #[code exits]
+        | performs system exit after printing, using the value of the
+        | argument as the exit code, e.g. #[code exits=1].

View File

@@ -47,9 +47,7 @@ include _models-list
     | The old models are also #[+a(gh("spacy") + "/tree/v1.6.0") attached to the v1.6.0 release].
     | To download and install them manually, unpack the archive, drop the
-    | contained directory into #[code spacy/data] and load the model via
-    | #[code spacy.load('en')] or #[code spacy.load('de')].
+    | contained directory into #[code spacy/data].

 p
     | The easiest way to download a model is via spaCy's
     | #[+api("cli#download") #[code download]] command. It takes care of
@@ -142,7 +140,7 @@ p
     doc = nlp(u'This is a sentence.')

-+aside("Tip: Preview model info")
++infobox("Tip: Preview model info")
     | You can use the #[+api("cli#info") #[code info]] command or
     | #[+api("spacy#info") #[code spacy.info()]] method to print a model's meta data
     | before loading it. Each #[code Language] object with a loaded model also

View File

@@ -30,7 +30,7 @@ p
     | or "WORLD".

 +code.
-    [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}]
+    [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]

 p
     | First, we initialise the #[code Matcher] with a vocab. The matcher must
@@ -43,13 +43,12 @@ p
 +code.
     import spacy
     from spacy.matcher import Matcher
-    from spacy.attrs import LOWER, IS_PUNCT # don't forget to import the attrs!

     nlp = spacy.load('en')
     matcher = Matcher(nlp.vocab)
     # add match ID "HelloWorld" with no callback and one pattern
     matcher.add('HelloWorld', on_match=None,
-                [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}])
+                [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}])
     doc = nlp(u'Hello, world! Hello world!')
     matches = matcher(doc)
@@ -63,8 +62,8 @@ p
 +code.
     matcher.add('HelloWorld', on_match=None,
-                [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
-                [{LOWER: 'hello'}, {LOWER: 'world'}])
+                [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}],
+                [{'LOWER': 'hello'}, {'LOWER': 'world'}])

 p
     | By default, the matcher will only return the matches and
@@ -92,14 +91,13 @@ p
 +code.
     import spacy
     from spacy.matcher import Matcher
-    from spacy.attrs import ORTH, UPPER, LOWER, IS_DIGIT

     nlp = spacy.load('en')
     matcher = Matcher(nlp.vocab)
     matcher.add('GoogleIO', on_match=add_event_ent,
-                [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}],
-                [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}, {IS_DIGIT: True}])
+                [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}],
+                [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}, {'IS_DIGIT': True}])

     # Get the ID of the 'EVENT' entity type. This is required to set an entity.
     EVENT = nlp.vocab.strings['EVENT']
@@ -120,8 +118,8 @@ p
 +code.
     matcher.add('BAD_HTML', on_match=merge_and_flag,
-                [{ORTH: '&lt;'}, {LOWER: 'br'}, {ORTH: '&gt;'}],
-                [{ORTH: '&lt;'}, {LOWER: 'br/'}, {ORTH: '&gt;'}])
+                [{'ORTH': '&lt;'}, {'LOWER': 'br'}, {'ORTH': '&gt;'}],
+                [{'ORTH': '&lt;'}, {'LOWER': 'br/'}, {'ORTH': '&gt;'}])

     # Add a new custom flag to the vocab, which is always False by default.
     # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
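
Note: the usage examples above reference add_event_ent and merge_and_flag callbacks that are defined elsewhere in the guide, not in this diff. A hedged sketch of what an add_event_ent callback could look like under the new (match_id, start, end) match format:

    def add_event_ent(matcher, doc, i, matches):
        # Append the matched span to doc.ents as an 'EVENT' entity.
        match_id, start, end = matches[i]
        doc.ents += ((EVENT, start, end),)  # EVENT = nlp.vocab.strings['EVENT'], as above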