Mirror of https://github.com/explosion/spaCy.git (synced 2025-03-03 19:08:06 +03:00)

Commit 530172d57a: Merge branch 'master' of https://github.com/explosion/spaCy into feature/better-faster-matcher
requirements.txt
@@ -6,7 +6,6 @@ preshed>=1.0.0,<2.0.0
 thinc>=6.10.1,<6.11.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
-six
 ujson>=1.35
 dill>=0.2,<0.3
 requests>=2.13.0,<3.0.0
@@ -16,4 +15,3 @@ pytest>=3.0.6,<4.0.0
 mock>=2.0.0,<3.0.0
 msgpack-python
 msgpack-numpy==0.4.1
-html5lib==1.0b8

setup.py
@@ -191,8 +191,6 @@ def setup_package():
             'preshed>=1.0.0,<2.0.0',
             'thinc>=6.10.1,<6.11.0',
             'plac<1.0.0,>=0.9.6',
-            'six',
-            'html5lib==1.0b8',
             'pathlib',
             'ujson>=1.35',
             'dill>=0.2,<0.3',

spacy/about.py
@@ -3,13 +3,13 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy'
-__version__ = '2.0.8'
+__version__ = '2.1.0.dev0'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
 __email__ = 'contact@explosion.ai'
 __license__ = 'MIT'
-__release__ = True
+__release__ = False

 __docs_models__ = 'https://spacy.io/usage/models'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'

spacy/compat.py
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-import six
 import ftfy
 import sys
 import ujson
@@ -47,9 +46,10 @@ is_windows = sys.platform.startswith('win')
 is_linux = sys.platform.startswith('linux')
 is_osx = sys.platform == 'darwin'

-is_python2 = six.PY2
-is_python3 = six.PY3
-is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1]<5)
+# See: https://github.com/benjaminp/six/blob/master/six.py
+is_python2 = sys.version_info[0] == 2
+is_python3 = sys.version_info[0] == 3
+is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)

 if is_python2:
     bytes_ = str

spacy/lang/lex_attrs.py
@@ -144,7 +144,7 @@ def is_lower(string): return string.islower()
 def is_space(string): return string.isspace()
 def is_title(string): return string.istitle()
 def is_upper(string): return string.isupper()
-def is_stop(string, stops=set()): return string in stops
+def is_stop(string, stops=set()): return string.lower() in stops
 def is_oov(string): return True
 def get_prob(string): return -20.
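
The change above makes the IS_STOP lexical attribute case-insensitive by lowercasing the string before the membership check. A minimal standalone sketch of the behavioural difference (the sample stop-word set is made up; spaCy loads the real set per language):

    STOP_WORDS = {'a', 'is', 'the'}  # illustrative sample, not spaCy's full list

    def is_stop_old(string, stops=set()):
        return string in stops

    def is_stop_new(string, stops=set()):
        return string.lower() in stops

    assert is_stop_old('The', STOP_WORDS) is False  # capitalized token missed
    assert is_stop_new('The', STOP_WORDS) is True   # matches regardless of case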

spacy/matcher.pyx
@@ -637,17 +637,35 @@ cdef class PhraseMatcher:
                 on_match(self, doc, i, matches)
         return matches

-    def pipe(self, stream, batch_size=1000, n_threads=2):
+    def pipe(self, stream, batch_size=1000, n_threads=2, return_matches=False,
+             as_tuples=False):
         """Match a stream of documents, yielding them in turn.

         docs (iterable): A stream of documents.
         batch_size (int): Number of documents to accumulate into a working set.
         n_threads (int): The number of threads with which to work on the buffer
             in parallel, if the implementation supports multi-threading.
+        return_matches (bool): Yield the match lists along with the docs, making
+            results (doc, matches) tuples.
+        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
+            and yield (result, context) tuples out.
+            If both return_matches and as_tuples are True, the output will
+            be a sequence of ((doc, matches), context) tuples.
         YIELDS (Doc): Documents, in order.
         """
-        for doc in stream:
-            self(doc)
-            yield doc
+        if as_tuples:
+            for doc, context in stream:
+                matches = self(doc)
+                if return_matches:
+                    yield ((doc, matches), context)
+                else:
+                    yield (doc, context)
+        else:
+            for doc in stream:
+                matches = self(doc)
+                if return_matches:
+                    yield (doc, matches)
+                else:
+                    yield doc

     def accept_match(self, Doc doc, int start, int end):
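
The extended pipe() signature can be exercised roughly as follows; a hedged sketch assuming spaCy 2.x, with the phrase pattern, texts and context dicts invented for illustration:

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank('en')
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('GPE', None, nlp('New York'))  # spaCy 2.x add(key, on_match, *docs)

    texts = ['I live in New York', 'Berlin is nice too']
    contexts = [{'id': 1}, {'id': 2}]

    # return_matches=True yields (doc, matches); as_tuples=True threads a context
    # through, so each item comes out as ((doc, matches), context).
    stream = zip(nlp.pipe(texts), contexts)
    for (doc, matches), context in matcher.pipe(stream, return_matches=True, as_tuples=True):
        spans = [doc[start:end].text for match_id, start, end in matches]
        print(context['id'], spans)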

spacy/symbols.pxd
@@ -85,6 +85,7 @@ cdef enum symbol_t:
     SENT_START
+    SPACY
     PROB
     LANG

     ADJ
     ADP
@@ -108,8 +109,9 @@ cdef enum symbol_t:
     SPACE

     Animacy_anim
-    Animacy_inam
+    Animacy_inan
     Animacy_hum  # U20
+    Animacy_nhum
     Aspect_freq
     Aspect_imp
     Aspect_mod
@@ -393,6 +395,7 @@ cdef enum symbol_t:
     EVENT
     WORK_OF_ART
     LANGUAGE
+    LAW

     DATE
     TIME
@@ -451,10 +454,9 @@ cdef enum symbol_t:
     prt
     punct
     quantmod
+    relcl
     rcmod
     root
     xcomp

     acl
-    LAW
-    LANG

spacy/symbols.pyx
@@ -114,8 +114,9 @@ IDS = {
     "SPACE": SPACE,

     "Animacy_anim": Animacy_anim,
-    "Animacy_inam": Animacy_inam,
+    "Animacy_inam": Animacy_inan,
     "Animacy_hum": Animacy_hum,  # U20
+    "Animacy_nhum": Animacy_nhum,
     "Aspect_freq": Aspect_freq,
     "Aspect_imp": Aspect_imp,
     "Aspect_mod": Aspect_mod,
@@ -458,6 +459,7 @@ IDS = {
     "punct": punct,
     "quantmod": quantmod,
     "rcmod": rcmod,
+    "relcl": relcl,
     "root": root,
     "xcomp": xcomp,

spacy/tests/doc/test_spans.py
@@ -19,6 +19,15 @@ def doc(en_tokenizer):
     return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)


+@pytest.fixture
+def doc_not_parsed(en_tokenizer):
+    text = "This is a sentence. This is another sentence. And a third."
+    tokens = en_tokenizer(text)
+    d = get_doc(tokens.vocab, [t.text for t in tokens])
+    d.is_parsed = False
+    return d
+
+
 def test_spans_sent_spans(doc):
     sents = list(doc.sents)
     assert sents[0].start == 0
@@ -34,6 +43,7 @@ def test_spans_root(doc):
     assert span.root.text == 'sentence'
     assert span.root.head.text == 'is'

+
 def test_spans_string_fn(doc):
     span = doc[0:4]
     assert len(span) == 4
@@ -41,6 +51,7 @@ def test_spans_string_fn(doc):
     assert span.upper_ == 'THIS IS A SENTENCE'
     assert span.lower_ == 'this is a sentence'

+
 def test_spans_root2(en_tokenizer):
     text = "through North and South Carolina"
     heads = [0, 3, -1, -2, -4]
@@ -49,12 +60,17 @@ def test_spans_root2(en_tokenizer):
     assert doc[-2:].root.text == 'Carolina'


-def test_spans_span_sent(doc):
+def test_spans_span_sent(doc, doc_not_parsed):
     """Test span.sent property"""
     assert len(list(doc.sents))
     assert doc[:2].sent.root.text == 'is'
     assert doc[:2].sent.text == 'This is a sentence .'
     assert doc[6:7].sent.root.left_edge.text == 'This'
+    # test on manual sbd
+    doc_not_parsed[0].is_sent_start = True
+    doc_not_parsed[5].is_sent_start = True
+    assert doc_not_parsed[1:3].sent == doc_not_parsed[0:5]
+    assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]


 def test_spans_lca_matrix(en_tokenizer):

@@ -2,9 +2,9 @@
 from __future__ import unicode_literals

 from ....parts_of_speech import SPACE
+from ....compat import unicode_
 from ...util import get_doc

-import six
 import pytest
@@ -24,8 +24,8 @@ def test_tag_names(EN):
     text = "I ate pizzas with anchovies."
     doc = EN(text, disable=['parser'])
     assert type(doc[2].pos) == int
-    assert isinstance(doc[2].pos_, six.text_type)
-    assert isinstance(doc[2].dep_, six.text_type)
+    assert isinstance(doc[2].pos_, unicode_)
+    assert isinstance(doc[2].dep_, unicode_)
     assert doc[2].tag_ == u'NNS'

spacy/tests/regression/test_issue1889.py (new file)
@@ -0,0 +1,11 @@
+# coding: utf-8
+from __future__ import unicode_literals
+from ...lang.lex_attrs import is_stop
+from ...lang.en.stop_words import STOP_WORDS
+
+import pytest
+
+
+@pytest.mark.parametrize('word', ['the'])
+def test_lex_attrs_stop_words_case_sensitivity(word):
+    assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)

spacy/tokens/doc.pyx
@@ -186,6 +186,20 @@ cdef class Doc:
     def _(self):
         return Underscore(Underscore.doc_extensions, self)

+    @property
+    def is_sentenced(self):
+        # Check if the document has sentence boundaries,
+        # i.e at least one tok has the sent_start in (-1, 1)
+        if 'sents' in self.user_hooks:
+            return True
+        if self.is_parsed:
+            return True
+        for i in range(self.length):
+            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
+                return True
+        else:
+            return False
+
     def __getitem__(self, object i):
         """Get a `Token` or `Span` object.
|
|||
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'sents' in self.user_hooks:
|
||||
yield from self.user_hooks['sents'](self)
|
||||
return
|
||||
|
||||
cdef int i
|
||||
if not self.is_parsed:
|
||||
for i in range(1, self.length):
|
||||
if self.c[i].sent_start != 0:
|
||||
break
|
||||
else:
|
||||
if not self.is_sentenced:
|
||||
raise ValueError(
|
||||
"Sentence boundaries unset. You can add the 'sentencizer' "
|
||||
"component to the pipeline with: "
|
||||
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
|
||||
"Alternatively, add the dependency parser, or set "
|
||||
"sentence boundaries by setting doc[i].sent_start")
|
||||
if 'sents' in self.user_hooks:
|
||||
yield from self.user_hooks['sents'](self)
|
||||
else:
|
||||
start = 0
|
||||
for i in range(1, self.length):
|
||||
if self.c[i].sent_start == 1:
|
||||
|
|
|
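
A hedged sketch of the workflow this error message points to, assuming the spaCy 2.x pipeline API:

    import spacy

    nlp = spacy.blank('en')
    # Iterating doc.sents now would raise the ValueError above: no parser, no boundaries.

    nlp.add_pipe(nlp.create_pipe('sentencizer'))  # rule-based sentence boundary detection
    doc = nlp('One sentence. Another sentence.')
    print([sent.text for sent in doc.sents])      # expected: ['One sentence.', 'Another sentence.']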

spacy/tokens/span.pyx
@@ -285,9 +285,14 @@ cdef class Span:
         def __get__(self):
             if 'sent' in self.doc.user_span_hooks:
                 return self.doc.user_span_hooks['sent'](self)
-            # This should raise if we're not parsed.
+            # This should raise if we're not parsed
+            # or don't have any sbd component :)
             self.doc.sents
+            # if doc is parsed we can use the deps to find the sentence
+            # otherwise we use the `sent_start` token attribute
             cdef int n = 0
+            cdef int i
+            if self.doc.is_parsed:
                 root = &self.doc.c[self.start]
                 while root.head != 0:
                     root += root.head
@@ -295,6 +300,24 @@ cdef class Span:
                 if n >= self.doc.length:
                     raise RuntimeError
                 return self.doc[root.l_edge:root.r_edge + 1]
+            elif self.doc.is_sentenced:
+                # find start of the sentence
+                start = self.start
+                while self.doc.c[start].sent_start != 1 and start > 0:
+                    start += -1
+                # find end of the sentence
+                end = self.end
+                while self.doc.c[end].sent_start != 1:
+                    end += 1
+                    if n >= self.doc.length:
+                        break
+                #
+                return self.doc[start:end]
+            else:
+                raise ValueError(
+                    "Access to sentence requires either the dependency parse "
+                    "or sentence boundaries to be set by setting " +
+                    "doc[i].is_sent_start = True")

     property has_vector:
         """RETURNS (bool): Whether a word vector is associated with the object.
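
The new sent_start fallback in Span.sent can be exercised without a parser; a minimal sketch, assuming a blank English pipeline:

    import spacy

    nlp = spacy.blank('en')
    doc = nlp('This is a sentence. This is another sentence.')
    doc[5].is_sent_start = True   # manually mark the start of the second sentence

    span = doc[1:3]               # 'is a', inside the first sentence
    print(span.sent.text)         # expected: 'This is a sentence.'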

@@ -325,6 +325,12 @@ p The L2 norm of the lexeme's vector representation.
         +cell bool
         +cell Is the lexeme a quotation mark?

+    +row
+        +cell #[code is_currency]
+            +tag-new("2.0.8")
+        +cell bool
+        +cell Is the lexeme a currency symbol?
+
     +row
         +cell #[code like_url]
         +cell bool

@@ -111,6 +111,25 @@ p Match a stream of documents, yielding them in turn.
             | parallel, if the #[code Matcher] implementation supports
             | multi-threading.

+    +row
+        +cell #[code return_matches]
+            +tag-new(2.1)
+        +cell bool
+        +cell
+            | Yield the match lists along with the docs, making results
+            | #[code (doc, matches)] tuples.
+
+    +row
+        +cell #[code as_tuples]
+            +tag-new(2.1)
+        +cell bool
+        +cell
+            | Interpret the input stream as #[code (doc, context)] tuples, and
+            | yield #[code (result, context)] tuples out. If both
+            | #[code return_matches] and #[code as_tuples] are #[code True],
+            | the output will be a sequence of
+            | #[code ((doc, matches), context)] tuples.
+
     +row("foot")
         +cell yields
         +cell #[code Doc]

@@ -740,6 +740,12 @@ p The L2 norm of the token's vector representation.
         +cell bool
         +cell Is the token a quotation mark?

+    +row
+        +cell #[code is_currency]
+            +tag-new("2.0.8")
+        +cell bool
+        +cell Is the token a currency symbol?
+
     +row
         +cell #[code like_url]
         +cell bool
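
A hedged sketch of the is_currency flag documented here (available from spaCy 2.0.8; blank English pipeline assumed):

    import spacy

    nlp = spacy.blank('en')
    doc = nlp('The ticket costs $ 30')      # spaces keep the tokenization unambiguous
    print(doc[3].text, doc[3].is_currency)  # $ True
    print(doc[0].text, doc[0].is_currency)  # The False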