Merge branch 'master' of https://github.com/explosion/spaCy into feature/better-faster-matcher

Matthew Honnibal 2018-02-18 14:40:42 +01:00
commit 530172d57a
16 changed files with 158 additions and 51 deletions

requirements.txt

@@ -6,7 +6,6 @@ preshed>=1.0.0,<2.0.0
thinc>=6.10.1,<6.11.0
murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6
six
ujson>=1.35
dill>=0.2,<0.3
requests>=2.13.0,<3.0.0
@@ -16,4 +15,3 @@ pytest>=3.0.6,<4.0.0
mock>=2.0.0,<3.0.0
msgpack-python
msgpack-numpy==0.4.1
html5lib==1.0b8

setup.py

@@ -191,8 +191,6 @@ def setup_package():
'preshed>=1.0.0,<2.0.0',
'thinc>=6.10.1,<6.11.0',
'plac<1.0.0,>=0.9.6',
'six',
'html5lib==1.0b8',
'pathlib',
'ujson>=1.35',
'dill>=0.2,<0.3',

spacy/about.py

@@ -3,13 +3,13 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy'
__version__ = '2.0.8'
__version__ = '2.1.0.dev0'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'
__email__ = 'contact@explosion.ai'
__license__ = 'MIT'
__release__ = True
__release__ = False
__docs_models__ = 'https://spacy.io/usage/models'
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'

spacy/compat.py

@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import six
import ftfy
import sys
import ujson
@@ -47,9 +46,10 @@ is_windows = sys.platform.startswith('win')
is_linux = sys.platform.startswith('linux')
is_osx = sys.platform == 'darwin'
is_python2 = six.PY2
is_python3 = six.PY3
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1]<5)
# See: https://github.com/benjaminp/six/blob/master/six.py
is_python2 = sys.version_info[0] == 2
is_python3 = sys.version_info[0] == 3
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)
if is_python2:
bytes_ = str
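With six gone, the version flags come straight from sys.version_info. A minimal sketch of the resulting shim (the Python 3 branch is assumed from the rest of spacy/compat.py; the hunk above only shows the Python 2 side):

```python
import sys

is_python2 = sys.version_info[0] == 2
is_python3 = sys.version_info[0] == 3
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)

if is_python2:
    bytes_ = str            # 8-bit string type on Python 2
    unicode_ = unicode      # noqa: F821 (only defined on Python 2)
else:
    bytes_ = bytes
    unicode_ = str
```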

spacy/lang/lex_attrs.py

@@ -144,7 +144,7 @@ def is_lower(string): return string.islower()
def is_space(string): return string.isspace()
def is_title(string): return string.istitle()
def is_upper(string): return string.isupper()
def is_stop(string, stops=set()): return string in stops
def is_stop(string, stops=set()): return string.lower() in stops
def is_oov(string): return True
def get_prob(string): return -20.
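The one-line change lowercases the token before the membership test, so stop-word detection no longer depends on casing. A quick illustration with a toy stop list (not the real STOP_WORDS):

```python
STOPS = {'the', 'a', 'an'}

def is_stop(string, stops=set()):
    return string.lower() in stops

assert is_stop('the', STOPS)
assert is_stop('The', STOPS)   # previously False, now True
```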

spacy/matcher.pyx

@@ -637,18 +637,36 @@ cdef class PhraseMatcher:
on_match(self, doc, i, matches)
return matches
def pipe(self, stream, batch_size=1000, n_threads=2):
def pipe(self, stream, batch_size=1000, n_threads=2, return_matches=False,
as_tuples=False):
"""Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents.
batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the implementation supports multi-threading.
return_matches (bool): Yield the match lists along with the docs, making
results (doc, matches) tuples.
as_tuples (bool): Interpret the input stream as (doc, context) tuples,
and yield (result, context) tuples out.
If both return_matches and as_tuples are True, the output will
be a sequence of ((doc, matches), context) tuples.
YIELDS (Doc): Documents, in order.
"""
for doc in stream:
self(doc)
yield doc
if as_tuples:
for doc, context in stream:
matches = self(doc)
if return_matches:
yield ((doc, matches), context)
else:
yield (doc, context)
else:
for doc in stream:
matches = self(doc)
if return_matches:
yield (doc, matches)
else:
yield doc
def accept_match(self, Doc doc, int start, int end):
cdef int i, j
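A sketch of how the two new flags combine in practice, with a minimal PhraseMatcher set up inline (the pattern and texts are made up for the example):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')
matcher = PhraseMatcher(nlp.vocab)
matcher.add('HELLO', None, nlp(u'Hello'))

data = [(u'Hello world', 'doc-1'), (u'Another text', 'doc-2')]
stream = ((nlp(text), ctx) for text, ctx in data)

# as_tuples consumes (doc, context) pairs; return_matches attaches the
# match list, so each item comes out as ((doc, matches), context).
for (doc, matches), ctx in matcher.pipe(stream, return_matches=True,
                                        as_tuples=True):
    print(ctx, [(match_id, start, end) for match_id, start, end in matches])
```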

spacy/symbols.pxd

@@ -85,6 +85,7 @@ cdef enum symbol_t:
SENT_START
SPACY
PROB
LANG
ADJ
ADP
@@ -108,8 +109,9 @@ cdef enum symbol_t:
SPACE
Animacy_anim
Animacy_inam
Animacy_inan
Animacy_hum # U20
Animacy_nhum
Aspect_freq
Aspect_imp
Aspect_mod
@@ -393,6 +395,7 @@ cdef enum symbol_t:
EVENT
WORK_OF_ART
LANGUAGE
LAW
DATE
TIME
@@ -451,10 +454,9 @@ cdef enum symbol_t:
prt
punct
quantmod
relcl
rcmod
root
xcomp
acl
LAW
LANG

spacy/symbols.pyx

@@ -114,8 +114,9 @@ IDS = {
"SPACE": SPACE,
"Animacy_anim": Animacy_anim,
"Animacy_inam": Animacy_inam,
"Animacy_inam": Animacy_inan,
"Animacy_hum": Animacy_hum, # U20
"Animacy_nhum": Animacy_nhum,
"Aspect_freq": Aspect_freq,
"Aspect_imp": Aspect_imp,
"Aspect_mod": Aspect_mod,
@@ -458,6 +459,7 @@ IDS = {
"punct": punct,
"quantmod": quantmod,
"rcmod": rcmod,
"relcl": relcl,
"root": root,
"xcomp": xcomp,

spacy/tests/doc/test_spans.py

@@ -19,6 +19,15 @@ def doc(en_tokenizer):
return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
@pytest.fixture
def doc_not_parsed(en_tokenizer):
text = "This is a sentence. This is another sentence. And a third."
tokens = en_tokenizer(text)
d = get_doc(tokens.vocab, [t.text for t in tokens])
d.is_parsed = False
return d
def test_spans_sent_spans(doc):
sents = list(doc.sents)
assert sents[0].start == 0
@@ -34,6 +43,7 @@ def test_spans_root(doc):
assert span.root.text == 'sentence'
assert span.root.head.text == 'is'
def test_spans_string_fn(doc):
span = doc[0:4]
assert len(span) == 4
@@ -41,6 +51,7 @@ def test_spans_string_fn(doc):
assert span.upper_ == 'THIS IS A SENTENCE'
assert span.lower_ == 'this is a sentence'
def test_spans_root2(en_tokenizer):
text = "through North and South Carolina"
heads = [0, 3, -1, -2, -4]
@@ -49,12 +60,17 @@ def test_spans_root2(en_tokenizer):
assert doc[-2:].root.text == 'Carolina'
def test_spans_span_sent(doc):
def test_spans_span_sent(doc, doc_not_parsed):
"""Test span.sent property"""
assert len(list(doc.sents))
assert doc[:2].sent.root.text == 'is'
assert doc[:2].sent.text == 'This is a sentence .'
assert doc[6:7].sent.root.left_edge.text == 'This'
# test on manual sbd
doc_not_parsed[0].is_sent_start = True
doc_not_parsed[5].is_sent_start = True
assert doc_not_parsed[1:3].sent == doc_not_parsed[0:5]
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
def test_spans_lca_matrix(en_tokenizer):

spacy/tests/lang/en/test_tagger.py

@@ -2,9 +2,9 @@
from __future__ import unicode_literals
from ....parts_of_speech import SPACE
from ....compat import unicode_
from ...util import get_doc
import six
import pytest
@@ -24,8 +24,8 @@ def test_tag_names(EN):
text = "I ate pizzas with anchovies."
doc = EN(text, disable=['parser'])
assert type(doc[2].pos) == int
assert isinstance(doc[2].pos_, six.text_type)
assert isinstance(doc[2].dep_, six.text_type)
assert isinstance(doc[2].pos_, unicode_)
assert isinstance(doc[2].dep_, unicode_)
assert doc[2].tag_ == u'NNS'

spacy/tests/lang/test_attrs.py

@@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
from ...lang.lex_attrs import is_stop
from ...lang.en.stop_words import STOP_WORDS
import pytest
@pytest.mark.parametrize('word', ['the'])
def test_lex_attrs_stop_words_case_sensitivity(word):
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)

spacy/tokens/doc.pyx

@@ -186,6 +186,20 @@ cdef class Doc:
def _(self):
return Underscore(Underscore.doc_extensions, self)
@property
def is_sentenced(self):
        # Check if the document has sentence boundaries,
        # i.e. at least one token has sent_start set to -1 or 1
if 'sents' in self.user_hooks:
return True
if self.is_parsed:
return True
for i in range(self.length):
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
return True
else:
return False
def __getitem__(self, object i):
"""Get a `Token` or `Span` object.
@@ -515,29 +529,23 @@ cdef class Doc:
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
"""
def __get__(self):
if not self.is_sentenced:
raise ValueError(
"Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: "
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
"Alternatively, add the dependency parser, or set "
"sentence boundaries by setting doc[i].sent_start")
if 'sents' in self.user_hooks:
yield from self.user_hooks['sents'](self)
            return
        cdef int i
        if not self.is_parsed:
            for i in range(1, self.length):
                if self.c[i].sent_start != 0:
                    break
            else:
                raise ValueError(
                    "Sentence boundaries unset. You can add the 'sentencizer' "
                    "component to the pipeline with: "
                    "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
                    "Alternatively, add the dependency parser, or set "
                    "sentence boundaries by setting doc[i].sent_start")
        start = 0
        for i in range(1, self.length):
            if self.c[i].sent_start == 1:
                yield Span(self, start, i)
                start = i
        if start != self.length:
            yield Span(self, start, self.length)
        else:
            start = 0
            for i in range(1, self.length):
                if self.c[i].sent_start == 1:
                    yield Span(self, start, i)
                    start = i
            if start != self.length:
                yield Span(self, start, self.length)
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
if self.length == 0:
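Together, is_sentenced and the reworked getter mean manually set boundaries now drive doc.sents without a parse, as the new test_spans_span_sent test exercises. A minimal sketch, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank('en')
doc = nlp(u'This is a sentence. This is another.')

# With no parser and no boundaries set, doc.sents raises the ValueError above.
doc[0].is_sent_start = True
doc[5].is_sent_start = True   # second sentence starts at token 5

print([s.text for s in doc.sents])
# ['This is a sentence.', 'This is another.']
```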

spacy/tokens/span.pyx

@@ -285,16 +285,39 @@ cdef class Span:
def __get__(self):
if 'sent' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sent'](self)
# This should raise if we're not parsed.
        # This should raise if the doc isn't parsed
        # or doesn't have any sentence boundaries set
self.doc.sents
# if doc is parsed we can use the deps to find the sentence
# otherwise we use the `sent_start` token attribute
cdef int n = 0
root = &self.doc.c[self.start]
while root.head != 0:
root += root.head
n += 1
if n >= self.doc.length:
raise RuntimeError
return self.doc[root.l_edge:root.r_edge + 1]
cdef int i
if self.doc.is_parsed:
root = &self.doc.c[self.start]
while root.head != 0:
root += root.head
n += 1
if n >= self.doc.length:
raise RuntimeError
return self.doc[root.l_edge:root.r_edge + 1]
        elif self.doc.is_sentenced:
            # find start of the sentence
            start = self.start
            while self.doc.c[start].sent_start != 1 and start > 0:
                start -= 1
            # find end of the sentence, guarding against running
            # past the end of the doc
            end = self.end
            while end < self.doc.length and self.doc.c[end].sent_start != 1:
                end += 1
            return self.doc[start:end]
else:
raise ValueError(
"Access to sentence requires either the dependency parse "
"or sentence boundaries to be set by setting " +
"doc[i].is_sent_start = True")
property has_vector:
"""RETURNS (bool): Whether a word vector is associated with the object.

website/api/lexeme.jade

@@ -325,6 +325,12 @@ p The L2 norm of the lexeme's vector representation.
+cell bool
+cell Is the lexeme a quotation mark?
+row
+cell #[code is_currency]
+tag-new("2.0.8")
+cell bool
+cell Is the lexeme a currency symbol?
+row
+cell #[code like_url]
+cell bool
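The new flag behaves like the other boolean lexeme attributes. A small check, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank('en')
doc = nlp(u'I paid $5.')
assert doc[2].is_currency        # '$'
assert not doc[0].is_currency    # 'I'
```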

website/api/matcher.jade

@@ -111,6 +111,25 @@ p Match a stream of documents, yielding them in turn.
| parallel, if the #[code Matcher] implementation supports
| multi-threading.
+row
+cell #[code return_matches]
+tag-new(2.1)
+cell bool
+cell
| Yield the match lists along with the docs, making results
| #[code (doc, matches)] tuples.
+row
+cell #[code as_tuples]
+tag-new(2.1)
+cell bool
+cell
| Interpret the input stream as #[code (doc, context)] tuples, and
| yield #[code (result, context)] tuples out. If both
| #[code return_matches] and #[code as_tuples] are #[code True],
| the output will be a sequence of
| #[code ((doc, matches), context)] tuples.
+row("foot")
+cell yields
+cell #[code Doc]

website/api/token.jade

@@ -740,6 +740,12 @@ p The L2 norm of the token's vector representation.
+cell bool
+cell Is the token a quotation mark?
+row
+cell #[code is_currency]
+tag-new("2.0.8")
+cell bool
+cell Is the token a currency symbol?
+row
+cell #[code like_url]
+cell bool