Merge branch 'master' of https://github.com/explosion/spaCy into feature/better-faster-matcher

Matthew Honnibal 2018-02-18 14:40:42 +01:00
commit 530172d57a
16 changed files with 158 additions and 51 deletions

requirements.txt

@@ -6,7 +6,6 @@ preshed>=1.0.0,<2.0.0
thinc>=6.10.1,<6.11.0
murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6
six
ujson>=1.35
dill>=0.2,<0.3
requests>=2.13.0,<3.0.0
@@ -16,4 +15,3 @@ pytest>=3.0.6,<4.0.0
mock>=2.0.0,<3.0.0
msgpack-python
msgpack-numpy==0.4.1
html5lib==1.0b8

setup.py

@@ -191,8 +191,6 @@ def setup_package():
'preshed>=1.0.0,<2.0.0',
'thinc>=6.10.1,<6.11.0',
'plac<1.0.0,>=0.9.6',
'six',
'html5lib==1.0b8',
'pathlib',
'ujson>=1.35',
'dill>=0.2,<0.3',

spacy/about.py

@@ -3,13 +3,13 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy'
__version__ = '2.0.8'
__version__ = '2.1.0.dev0'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'
__email__ = 'contact@explosion.ai'
__license__ = 'MIT'
__release__ = True
__release__ = False
__docs_models__ = 'https://spacy.io/usage/models'
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'

spacy/compat.py

@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import six
import ftfy
import sys
import ujson
@@ -47,9 +46,10 @@ is_windows = sys.platform.startswith('win')
is_linux = sys.platform.startswith('linux')
is_osx = sys.platform == 'darwin'
is_python2 = six.PY2
is_python3 = six.PY3
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1]<5)
# See: https://github.com/benjaminp/six/blob/master/six.py
is_python2 = sys.version_info[0] == 2
is_python3 = sys.version_info[0] == 3
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)
if is_python2:
bytes_ = str
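With six gone, the version flags come straight from sys.version_info. A minimal sketch of the resulting shim (the Python 3 branch is assumed from the rest of spacy/compat.py; the hunk above only shows the Python 2 side):

```python
import sys

is_python2 = sys.version_info[0] == 2
is_python3 = sys.version_info[0] == 3
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)

if is_python2:
    bytes_ = str            # 8-bit string type on Python 2
    unicode_ = unicode      # noqa: F821 (only defined on Python 2)
else:
    bytes_ = bytes
    unicode_ = str
```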

spacy/lang/lex_attrs.py

@@ -144,7 +144,7 @@ def is_lower(string): return string.islower()
def is_space(string): return string.isspace()
def is_title(string): return string.istitle()
def is_upper(string): return string.isupper()
def is_stop(string, stops=set()): return string in stops
def is_stop(string, stops=set()): return string.lower() in stops
def is_oov(string): return True
def get_prob(string): return -20.
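The one-line change lowercases the token before the membership test, so stop-word detection no longer depends on casing. A quick illustration with a toy stop list (not the real STOP_WORDS):

```python
STOPS = {'the', 'a', 'an'}

def is_stop(string, stops=set()):
    return string.lower() in stops

assert is_stop('the', STOPS)
assert is_stop('The', STOPS)   # previously False, now True
```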

spacy/matcher.pyx

@@ -637,18 +637,36 @@ cdef class PhraseMatcher:
on_match(self, doc, i, matches)
return matches
def pipe(self, stream, batch_size=1000, n_threads=2):
def pipe(self, stream, batch_size=1000, n_threads=2, return_matches=False,
as_tuples=False):
"""Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents.
batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the implementation supports multi-threading.
return_matches (bool): Yield the match lists along with the docs, making
results (doc, matches) tuples.
as_tuples (bool): Interpret the input stream as (doc, context) tuples,
and yield (result, context) tuples out.
If both return_matches and as_tuples are True, the output will
be a sequence of ((doc, matches), context) tuples.
YIELDS (Doc): Documents, in order.
"""
for doc in stream:
self(doc)
yield doc
if as_tuples:
for doc, context in stream:
matches = self(doc)
if return_matches:
yield ((doc, matches), context)
else:
yield (doc, context)
else:
for doc in stream:
matches = self(doc)
if return_matches:
yield (doc, matches)
else:
yield doc
def accept_match(self, Doc doc, int start, int end):
cdef int i, j
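A sketch of how the two new flags combine in practice, with a minimal PhraseMatcher set up inline (the pattern and texts are made up for the example):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')
matcher = PhraseMatcher(nlp.vocab)
matcher.add('HELLO', None, nlp(u'Hello'))

data = [(u'Hello world', 'doc-1'), (u'Another text', 'doc-2')]
stream = ((nlp(text), ctx) for text, ctx in data)

# as_tuples consumes (doc, context) pairs; return_matches attaches the
# match list, so each item comes out as ((doc, matches), context).
for (doc, matches), ctx in matcher.pipe(stream, return_matches=True,
                                        as_tuples=True):
    print(ctx, [(match_id, start, end) for match_id, start, end in matches])
```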

spacy/symbols.pxd

@@ -85,6 +85,7 @@ cdef enum symbol_t:
SENT_START
SPACY
PROB
LANG
ADJ
ADP
@@ -108,8 +109,9 @@ cdef enum symbol_t:
SPACE
Animacy_anim
Animacy_inam
Animacy_inan
Animacy_hum # U20
Animacy_nhum
Aspect_freq
Aspect_imp
Aspect_mod
@@ -393,6 +395,7 @@ cdef enum symbol_t:
EVENT
WORK_OF_ART
LANGUAGE
LAW
DATE
TIME
@@ -451,10 +454,9 @@ cdef enum symbol_t:
prt
punct
quantmod
relcl
rcmod
root
xcomp
acl
LAW
LANG

spacy/symbols.pyx

@@ -114,8 +114,9 @@ IDS = {
"SPACE": SPACE,
"Animacy_anim": Animacy_anim,
"Animacy_inam": Animacy_inam,
"Animacy_inam": Animacy_inan,
"Animacy_hum": Animacy_hum, # U20
"Animacy_nhum": Animacy_nhum,
"Aspect_freq": Aspect_freq,
"Aspect_imp": Aspect_imp,
"Aspect_mod": Aspect_mod,
@@ -458,6 +459,7 @@ IDS = {
"punct": punct,
"quantmod": quantmod,
"rcmod": rcmod,
"relcl": relcl,
"root": root,
"xcomp": xcomp,

spacy/tests/doc/test_spans.py

@@ -19,6 +19,15 @@ def doc(en_tokenizer):
return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
@pytest.fixture
def doc_not_parsed(en_tokenizer):
text = "This is a sentence. This is another sentence. And a third."
tokens = en_tokenizer(text)
d = get_doc(tokens.vocab, [t.text for t in tokens])
d.is_parsed = False
return d
def test_spans_sent_spans(doc):
sents = list(doc.sents)
assert sents[0].start == 0
@@ -34,6 +43,7 @@ def test_spans_root(doc):
assert span.root.text == 'sentence'
assert span.root.head.text == 'is'
def test_spans_string_fn(doc):
span = doc[0:4]
assert len(span) == 4
@@ -41,6 +51,7 @@ def test_spans_string_fn(doc):
assert span.upper_ == 'THIS IS A SENTENCE'
assert span.lower_ == 'this is a sentence'
def test_spans_root2(en_tokenizer):
text = "through North and South Carolina"
heads = [0, 3, -1, -2, -4]
@@ -49,12 +60,17 @@ def test_spans_root2(en_tokenizer):
assert doc[-2:].root.text == 'Carolina'
def test_spans_span_sent(doc):
def test_spans_span_sent(doc, doc_not_parsed):
"""Test span.sent property"""
assert len(list(doc.sents))
assert doc[:2].sent.root.text == 'is'
assert doc[:2].sent.text == 'This is a sentence .'
assert doc[6:7].sent.root.left_edge.text == 'This'
# test on manual sbd
doc_not_parsed[0].is_sent_start = True
doc_not_parsed[5].is_sent_start = True
assert doc_not_parsed[1:3].sent == doc_not_parsed[0:5]
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
def test_spans_lca_matrix(en_tokenizer):

spacy/tests/lang/en/test_tagger.py

@@ -2,9 +2,9 @@
from __future__ import unicode_literals
from ....parts_of_speech import SPACE
from ....compat import unicode_
from ...util import get_doc
import six
import pytest
@@ -24,8 +24,8 @@ def test_tag_names(EN):
text = "I ate pizzas with anchovies."
doc = EN(text, disable=['parser'])
assert type(doc[2].pos) == int
assert isinstance(doc[2].pos_, six.text_type)
assert isinstance(doc[2].dep_, six.text_type)
assert isinstance(doc[2].pos_, unicode_)
assert isinstance(doc[2].dep_, unicode_)
assert doc[2].tag_ == u'NNS'

spacy/tests/lang/test_attrs.py

@@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
from ...lang.lex_attrs import is_stop
from ...lang.en.stop_words import STOP_WORDS
import pytest
@pytest.mark.parametrize('word', ['the'])
def test_lex_attrs_stop_words_case_sensitivity(word):
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)

spacy/tokens/doc.pyx

@@ -186,6 +186,20 @@ cdef class Doc:
def _(self):
return Underscore(Underscore.doc_extensions, self)
@property
def is_sentenced(self):
        # Check if the document has sentence boundaries,
        # i.e. at least one token has sent_start set to -1 or 1
if 'sents' in self.user_hooks:
return True
if self.is_parsed:
return True
for i in range(self.length):
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
return True
else:
return False
def __getitem__(self, object i):
"""Get a `Token` or `Span` object.
@@ -515,29 +529,23 @@ cdef class Doc:
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
"""
def __get__(self):
if not self.is_sentenced:
raise ValueError(
"Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: "
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
"Alternatively, add the dependency parser, or set "
"sentence boundaries by setting doc[i].sent_start")
if 'sents' in self.user_hooks:
yield from self.user_hooks['sents'](self)
            return
        cdef int i
        if not self.is_parsed:
            for i in range(1, self.length):
                if self.c[i].sent_start != 0:
                    break
            else:
                raise ValueError(
                    "Sentence boundaries unset. You can add the 'sentencizer' "
                    "component to the pipeline with: "
                    "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
                    "Alternatively, add the dependency parser, or set "
                    "sentence boundaries by setting doc[i].sent_start")
        start = 0
        for i in range(1, self.length):
            if self.c[i].sent_start == 1:
                yield Span(self, start, i)
                start = i
        if start != self.length:
            yield Span(self, start, self.length)
        else:
            start = 0
            for i in range(1, self.length):
                if self.c[i].sent_start == 1:
                    yield Span(self, start, i)
                    start = i
            if start != self.length:
                yield Span(self, start, self.length)
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
if self.length == 0:
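Together, is_sentenced and the reworked getter mean manually set boundaries now drive doc.sents without a parse, as the new test_spans_span_sent test exercises. A minimal sketch, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank('en')
doc = nlp(u'This is a sentence. This is another.')

# With no parser and no boundaries set, doc.sents raises the ValueError above.
doc[0].is_sent_start = True
doc[5].is_sent_start = True   # second sentence starts at token 5

print([s.text for s in doc.sents])
# ['This is a sentence.', 'This is another.']
```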

spacy/tokens/span.pyx

@@ -285,16 +285,39 @@ cdef class Span:
def __get__(self):
if 'sent' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sent'](self)
# This should raise if we're not parsed.
        # This should raise if the doc isn't parsed
        # or doesn't have any sentence boundaries set
self.doc.sents
# if doc is parsed we can use the deps to find the sentence
# otherwise we use the `sent_start` token attribute
cdef int n = 0
root = &self.doc.c[self.start]
while root.head != 0:
root += root.head
n += 1
if n >= self.doc.length:
raise RuntimeError
return self.doc[root.l_edge:root.r_edge + 1]
cdef int i
if self.doc.is_parsed:
root = &self.doc.c[self.start]
while root.head != 0:
root += root.head
n += 1
if n >= self.doc.length:
raise RuntimeError
return self.doc[root.l_edge:root.r_edge + 1]
        elif self.doc.is_sentenced:
            # find start of the sentence
            start = self.start
            while self.doc.c[start].sent_start != 1 and start > 0:
                start -= 1
            # find end of the sentence, guarding against running
            # past the end of the doc
            end = self.end
            while end < self.doc.length and self.doc.c[end].sent_start != 1:
                end += 1
            return self.doc[start:end]
else:
raise ValueError(
"Access to sentence requires either the dependency parse "
"or sentence boundaries to be set by setting " +
"doc[i].is_sent_start = True")
property has_vector:
"""RETURNS (bool): Whether a word vector is associated with the object.

website/api/lexeme.jade

@@ -325,6 +325,12 @@ p The L2 norm of the lexeme's vector representation.
+cell bool
+cell Is the lexeme a quotation mark?
+row
+cell #[code is_currency]
+tag-new("2.0.8")
+cell bool
+cell Is the lexeme a currency symbol?
+row
+cell #[code like_url]
+cell bool
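The new flag behaves like the other boolean lexeme attributes. A small check, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank('en')
doc = nlp(u'I paid $5.')
assert doc[2].is_currency        # '$'
assert not doc[0].is_currency    # 'I'
```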

website/api/matcher.jade

@@ -111,6 +111,25 @@ p Match a stream of documents, yielding them in turn.
| parallel, if the #[code Matcher] implementation supports
| multi-threading.
+row
+cell #[code return_matches]
+tag-new(2.1)
+cell bool
+cell
| Yield the match lists along with the docs, making results
| #[code (doc, matches)] tuples.
+row
+cell #[code as_tuples]
+tag-new(2.1)
+cell bool
+cell
| Interpret the input stream as #[code (doc, context)] tuples, and
| yield #[code (result, context)] tuples out. If both
| #[code return_matches] and #[code as_tuples] are #[code True],
| the output will be a sequence of
| #[code ((doc, matches), context)] tuples.
+row("foot")
+cell yields
+cell #[code Doc]

website/api/token.jade

@@ -740,6 +740,12 @@ p The L2 norm of the token's vector representation.
+cell bool
+cell Is the token a quotation mark?
+row
+cell #[code is_currency]
+tag-new("2.0.8")
+cell bool
+cell Is the token a currency symbol?
+row
+cell #[code like_url]
+cell bool