Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-25 00:34:20 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
commit 97487122ea

Makefile | 2

@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 sha = $(shell "git" "rev-parse" "--short" "HEAD")
 
-dist/spacy.pex :
+dist/spacy.pex : spacy/*.py* spacy/*/*.py*
 	python3.6 -m venv env3.6
 	source env3.6/bin/activate
 	env3.6/bin/pip install wheel
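
Note on the Makefile hunk: adding spacy/*.py* and spacy/*/*.py* as prerequisites means make dist/spacy.pex rebuilds the pex whenever a Python or Cython source in spacy/ or one of its immediate subpackages changes, instead of only when dist/spacy.pex is missing.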

@@ -650,7 +650,7 @@ class Language(object):
         for name, proc in self.pipeline:
             if name in disable:
                 continue
-            if not hasattr(proc, 'to_disk'):
+            if not hasattr(proc, 'from_disk'):
                 continue
             deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
         exclude = {p: False for p in disable}
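
This hunk sits in the deserialization path (note the proc.from_disk call right below), so the guard has to test for the method that is actually about to be called. A hypothetical component that can be saved but not loaded shows the difference; SaveOnlyPipe below is an illustration, not a spaCy class:

class SaveOnlyPipe(object):
    def to_disk(self, path, **kwargs):
        pass  # can write its state, but defines no from_disk()

proc = SaveOnlyPipe()
# old guard: hasattr(proc, 'to_disk') is True, so the loader would go on to call
#            proc.from_disk(...) and raise AttributeError
# new guard: hasattr(proc, 'from_disk') is False, so the component is skipped on load
assert hasattr(proc, 'to_disk') and not hasattr(proc, 'from_disk')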

@@ -95,7 +95,6 @@ def lemmatize(string, index, exceptions, rules):
     forms = []
     forms.extend(exceptions.get(string, []))
     oov_forms = []
-    if not forms:
-        for old, new in rules:
-            if string.endswith(old):
-                form = string[:len(string) - len(old)] + new
+    for old, new in rules:
+        if string.endswith(old):
+            form = string[:len(string) - len(old)] + new
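
Behaviourally, dropping the "if not forms:" guard means the suffix rules now run even when the exception table already supplied a lemma, rather than serving only as a fallback. A minimal sketch of the rule step in isolation (names mirror the diff; the handling of index and oov_forms is a simplification of the real lemmatizer):

def apply_rules(string, index, rules):
    # apply every matching suffix rule, keeping known forms apart from guesses
    forms, oov_forms = [], []
    for old, new in rules:
        if string.endswith(old):
            form = string[:len(string) - len(old)] + new
            if form in index:
                forms.append(form)
            else:
                oov_forms.append(form)
    return forms, oov_forms

print(apply_rules("studies", {"study"}, [("ies", "y"), ("s", "")]))
# -> (['study'], ['studie'])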

@@ -449,7 +449,10 @@ class Tagger(Pipe):
     def predict(self, docs):
         if not any(len(doc) for doc in docs):
             # Handle case where there are no tokens in any docs.
-            return [self.model.ops.allocate((0, self.model.nO)) for doc in docs]
+            n_labels = len(self.labels)
+            guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
+            tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
+            return guesses, tokvecs
         tokvecs = self.model.tok2vec(docs)
         scores = self.model.softmax(tokvecs)
         guesses = []
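
The empty-batch branch now returns the same two-value shape as the normal path: a per-doc list of score arrays sized by the number of tag labels, plus a zero-length token-vector array, so callers that unpack both values no longer break when every doc is empty. A rough standalone sketch of that guard, using numpy.zeros in place of self.model.ops.allocate (an assumption about what allocate returns):

import numpy

def predict_empty(docs, n_labels, tok2vec_width):
    # one (0, n_labels) score array per doc, plus an empty (0, width) token-vector array
    guesses = [numpy.zeros((0, n_labels), dtype='f') for _ in docs]
    tokvecs = numpy.zeros((0, tok2vec_width), dtype='f')
    return guesses, tokvecs

guesses, tokvecs = predict_empty([[], []], n_labels=50, tok2vec_width=128)
assert len(guesses) == 2 and guesses[0].shape == (0, 50) and tokvecs.shape == (0, 128)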

@@ -479,7 +482,7 @@ class Tagger(Pipe):
                 if lemma != 0 and lemma != doc.c[j].lex.orth:
                     doc.c[j].lemma = lemma
                 idx += 1
-            if tensors is not None:
+            if tensors is not None and len(tensors):
                 if isinstance(doc.tensor, numpy.ndarray) \
                 and not isinstance(tensors[i], numpy.ndarray):
                     doc.extend_tensor(tensors[i].get())
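
The added len(tensors) check covers the case where an empty batch yields a zero-length tensors sequence (as in the predict() change above), so tensors[i] is never indexed when there is nothing to extend doc.tensor with.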

@@ -217,6 +217,8 @@ cdef class Parser:
     def predict(self, docs, beam_width=1, beam_density=0.0, drop=0.):
         if isinstance(docs, Doc):
             docs = [docs]
+        if not any(len(doc) for doc in docs):
+            return self.moves.init_batch(docs)
         if beam_width < 2:
             return self.greedy_parse(docs, drop=drop)
         else:
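
Same pattern as the Tagger: when no doc in the batch contains any tokens, the parser returns the output of self.moves.init_batch(docs) (freshly initialised parse states) instead of running the greedy or beam parser on an empty batch.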

@@ -3,8 +3,10 @@ from __future__ import unicode_literals
 
 from ..util import make_tempdir
 from ...language import Language
+from ...tokenizer import Tokenizer
 
 import pytest
+import re
 
 
 @pytest.fixture

@@ -27,3 +29,24 @@ def test_serialize_language_meta_disk(meta_data):
         language.to_disk(d)
         new_language = Language().from_disk(d)
     assert new_language.meta == language.meta
+
+
+def test_serialize_with_custom_tokenizer():
+    """Test that serialization with custom tokenizer works without token_match.
+    See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
+    """
+    prefix_re = re.compile(r'''1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:''')
+    suffix_re = re.compile(r'''''')
+    infix_re = re.compile(r'''[~]''')
+
+    def custom_tokenizer(nlp):
+        return Tokenizer(nlp.vocab,
+                         {},
+                         prefix_search=prefix_re.search,
+                         suffix_search=suffix_re.search,
+                         infix_finditer=infix_re.finditer)
+
+    nlp = Language()
+    nlp.tokenizer = custom_tokenizer(nlp)
+    with make_tempdir() as d:
+        nlp.to_disk(d)
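
The test only needs to reach nlp.to_disk(d) without raising: per its docstring, the point is that a tokenizer built without a token_match pattern can still be serialized. As a usage note, the same setup can be exercised directly; the sketch below mirrors the test's patterns and only adds a print, and the expected split is an assumption rather than something the test asserts:

import re
from spacy.language import Language
from spacy.tokenizer import Tokenizer

prefix_re = re.compile(r'''1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:''')
suffix_re = re.compile(r'''''')
infix_re = re.compile(r'''[~]''')

nlp = Language()
nlp.tokenizer = Tokenizer(nlp.vocab, {},
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
                          infix_finditer=infix_re.finditer)
# roughly ['1/', 'great', '~', 'stuff'] is expected: "1/" split off as a prefix, "~" as an infix
print([t.text for t in nlp(u"1/great~stuff")])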