Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-25 00:34:20 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
commit 97487122ea

Makefile | 2

@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 sha = $(shell "git" "rev-parse" "--short" "HEAD")
 
-dist/spacy.pex :
+dist/spacy.pex : spacy/*.py* spacy/*/*.py*
 	python3.6 -m venv env3.6
 	source env3.6/bin/activate
 	env3.6/bin/pip install wheel
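
Note on the Makefile hunk: adding spacy/*.py* and spacy/*/*.py* as prerequisites means make dist/spacy.pex rebuilds the pex whenever a Python or Cython source in spacy/ or one of its immediate subpackages changes, instead of only when dist/spacy.pex is missing.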

@@ -650,7 +650,7 @@ class Language(object):
         for name, proc in self.pipeline:
             if name in disable:
                 continue
-            if not hasattr(proc, 'to_disk'):
+            if not hasattr(proc, 'from_disk'):
                 continue
             deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
         exclude = {p: False for p in disable}
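
This hunk sits in the deserialization path (note the proc.from_disk call right below), so the guard has to test for the method that is actually about to be called. A hypothetical component that can be saved but not loaded shows the difference; SaveOnlyPipe below is an illustration, not a spaCy class:

class SaveOnlyPipe(object):
    def to_disk(self, path, **kwargs):
        pass  # can write its state, but defines no from_disk()

proc = SaveOnlyPipe()
# old guard: hasattr(proc, 'to_disk') is True, so the loader would go on to call
#            proc.from_disk(...) and raise AttributeError
# new guard: hasattr(proc, 'from_disk') is False, so the component is skipped on load
assert hasattr(proc, 'to_disk') and not hasattr(proc, 'from_disk')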

@@ -95,7 +95,6 @@ def lemmatize(string, index, exceptions, rules):
     forms = []
     forms.extend(exceptions.get(string, []))
     oov_forms = []
-    if not forms:
-        for old, new in rules:
-            if string.endswith(old):
-                form = string[:len(string) - len(old)] + new
+    for old, new in rules:
+        if string.endswith(old):
+            form = string[:len(string) - len(old)] + new
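
Behaviourally, dropping the "if not forms:" guard means the suffix rules now run even when the exception table already supplied a lemma, rather than serving only as a fallback. A minimal sketch of the rule step in isolation (names mirror the diff; the handling of index and oov_forms is a simplification of the real lemmatizer):

def apply_rules(string, index, rules):
    # apply every matching suffix rule, keeping known forms apart from guesses
    forms, oov_forms = [], []
    for old, new in rules:
        if string.endswith(old):
            form = string[:len(string) - len(old)] + new
            if form in index:
                forms.append(form)
            else:
                oov_forms.append(form)
    return forms, oov_forms

print(apply_rules("studies", {"study"}, [("ies", "y"), ("s", "")]))
# -> (['study'], ['studie'])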

@@ -449,7 +449,10 @@ class Tagger(Pipe):
     def predict(self, docs):
         if not any(len(doc) for doc in docs):
             # Handle case where there are no tokens in any docs.
-            return [self.model.ops.allocate((0, self.model.nO)) for doc in docs]
+            n_labels = len(self.labels)
+            guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
+            tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
+            return guesses, tokvecs
         tokvecs = self.model.tok2vec(docs)
         scores = self.model.softmax(tokvecs)
         guesses = []
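
The empty-batch branch now returns the same two-value shape as the normal path: a per-doc list of score arrays sized by the number of tag labels, plus a zero-length token-vector array, so callers that unpack both values no longer break when every doc is empty. A rough standalone sketch of that guard, using numpy.zeros in place of self.model.ops.allocate (an assumption about what allocate returns):

import numpy

def predict_empty(docs, n_labels, tok2vec_width):
    # one (0, n_labels) score array per doc, plus an empty (0, width) token-vector array
    guesses = [numpy.zeros((0, n_labels), dtype='f') for _ in docs]
    tokvecs = numpy.zeros((0, tok2vec_width), dtype='f')
    return guesses, tokvecs

guesses, tokvecs = predict_empty([[], []], n_labels=50, tok2vec_width=128)
assert len(guesses) == 2 and guesses[0].shape == (0, 50) and tokvecs.shape == (0, 128)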

@@ -479,7 +482,7 @@ class Tagger(Pipe):
                 if lemma != 0 and lemma != doc.c[j].lex.orth:
                     doc.c[j].lemma = lemma
                 idx += 1
-            if tensors is not None:
+            if tensors is not None and len(tensors):
                 if isinstance(doc.tensor, numpy.ndarray) \
                 and not isinstance(tensors[i], numpy.ndarray):
                     doc.extend_tensor(tensors[i].get())
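
The added len(tensors) check covers the case where an empty batch yields a zero-length tensors sequence (as in the predict() change above), so tensors[i] is never indexed when there is nothing to extend doc.tensor with.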

@@ -217,6 +217,8 @@ cdef class Parser:
     def predict(self, docs, beam_width=1, beam_density=0.0, drop=0.):
         if isinstance(docs, Doc):
             docs = [docs]
+        if not any(len(doc) for doc in docs):
+            return self.moves.init_batch(docs)
         if beam_width < 2:
             return self.greedy_parse(docs, drop=drop)
         else:
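
Same pattern as the Tagger: when no doc in the batch contains any tokens, the parser returns the output of self.moves.init_batch(docs) (freshly initialised parse states) instead of running the greedy or beam parser on an empty batch.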

@@ -3,8 +3,10 @@ from __future__ import unicode_literals
 
 from ..util import make_tempdir
 from ...language import Language
+from ...tokenizer import Tokenizer
 
 import pytest
+import re
 
 
 @pytest.fixture

@@ -27,3 +29,24 @@ def test_serialize_language_meta_disk(meta_data):
         language.to_disk(d)
         new_language = Language().from_disk(d)
     assert new_language.meta == language.meta
+
+
+def test_serialize_with_custom_tokenizer():
+    """Test that serialization with custom tokenizer works without token_match.
+    See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
+    """
+    prefix_re = re.compile(r'''1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:''')
+    suffix_re = re.compile(r'''''')
+    infix_re = re.compile(r'''[~]''')
+
+    def custom_tokenizer(nlp):
+        return Tokenizer(nlp.vocab,
+                         {},
+                         prefix_search=prefix_re.search,
+                         suffix_search=suffix_re.search,
+                         infix_finditer=infix_re.finditer)
+
+    nlp = Language()
+    nlp.tokenizer = custom_tokenizer(nlp)
+    with make_tempdir() as d:
+        nlp.to_disk(d)
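
The test only needs to reach nlp.to_disk(d) without raising: per its docstring, the point is that a tokenizer built without a token_match pattern can still be serialized. As a usage note, the same setup can be exercised directly; the sketch below mirrors the test's patterns and only adds a print, and the expected split is an assumption rather than something the test asserts:

import re
from spacy.language import Language
from spacy.tokenizer import Tokenizer

prefix_re = re.compile(r'''1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:''')
suffix_re = re.compile(r'''''')
infix_re = re.compile(r'''[~]''')

nlp = Language()
nlp.tokenizer = Tokenizer(nlp.vocab, {},
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
                          infix_finditer=infix_re.finditer)
# roughly ['1/', 'great', '~', 'stuff'] is expected: "1/" split off as a prefix, "~" as an infix
print([t.text for t in nlp(u"1/great~stuff")])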