mirror of https://github.com/explosion/spaCy.git

Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Commit 1b65115bc2
.travis.yml (filename inferred)

@@ -14,8 +14,7 @@ os:
 env:
   - VIA=compile LC_ALL=en_US.ascii
   - VIA=compile
-  # - VIA=sdist
+  #- VIA=pypi_nightly

 install:
   - "./travis.sh"
@@ -23,7 +22,7 @@ install:
 script:
   - "pip install pytest pytest-timeout"
  - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
-  - if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi
+  - if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
   - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi

 notifications:
spacy/_ml.py (12 changes)
@@ -212,12 +212,14 @@ class PrecomputableMaxouts(Model):

 def drop_layer(layer, factor=2.):
     def drop_layer_fwd(X, drop=0.):
-        drop *= factor
-        mask = layer.ops.get_dropout_mask((1,), drop)
-        if mask is None or mask > 0:
+        if drop <= 0.:
             return layer.begin_update(X, drop=drop)
         else:
-            return X, lambda dX, sgd=None: dX
+            coinflip = layer.ops.xp.random.random()
+            if (coinflip / factor) >= drop:
+                return layer.begin_update(X, drop=drop)
+            else:
+                return X, lambda dX, sgd=None: dX

     model = wrap(drop_layer_fwd, layer)
     model.predict = layer
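The rewritten drop_layer replaces the old dropout-mask lookup with one coin flip per call: the wrapped layer either runs normally or is skipped for the whole batch, with an identity backward pass. A minimal NumPy sketch of the same control flow (the names and the layer_forward callable are illustrative, not spaCy's API):

    import numpy as np

    def drop_layer_fwd_sketch(layer_forward, X, drop=0.2, factor=2.0):
        # drop <= 0 means prediction time: always run the layer.
        if drop <= 0.0:
            return layer_forward(X)
        # One uniform draw decides the whole batch; dividing by `factor`
        # raises the effective drop rate (survival needs coinflip >= drop * factor).
        coinflip = np.random.random()
        if (coinflip / factor) >= drop:
            return layer_forward(X)
        return X  # layer skipped: input passes through unchanged

Unlike element-wise dropout, this drops an entire layer stochastically, in the spirit of stochastic-depth training for residual stacks.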
@@ -362,6 +364,8 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
     def backward(d_output, sgd=None):
         return (tokens, d_output)
     return vectors, backward
+
+
 def fine_tune(embedding, combine=None):
     if combine is not None:
         raise NotImplementedError(
spacy/about.py (filename inferred)

@@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy-nightly'
-__version__ = '2.0.0a12'
+__version__ = '2.0.0a13'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
spacy/lang/en/morph_rules.py (filename inferred)

@@ -59,7 +59,8 @@ MORPH_RULES = {

     "VBP": {
         "are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
-        "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
+        "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
+        "am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
     },

     "VBD": {
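MORPH_RULES maps a fine-grained tag to exceptional word forms and their analyses; the new "am" entry gives the first-person present of "be" a correct lemma and morphology. A toy lookup in the same shape (here LEMMA is just a string standing in for spaCy's attribute constant):

    LEMMA = 'lemma'  # stand-in for the spacy.attrs constant
    MORPH_RULES = {
        'VBP': {
            'am': {LEMMA: 'be', 'VerbForm': 'Fin', 'Person': 'One',
                   'Tense': 'Pres', 'Mood': 'Ind'},
        },
    }

    def analyse(tag, form):
        # Exception lookup: exact tag + surface form, else no override.
        return MORPH_RULES.get(tag, {}).get(form, {})

    print(analyse('VBP', 'am')[LEMMA])  # -> 'be'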
spacy/lemmatizer.py (filename inferred)

@@ -44,6 +44,11 @@ class Lemmatizer(object):
             return True
         elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
             return True
+        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
+        # morphology
+        elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
+                morphology.get('Tense') == 'pres'):
+            return True
         elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
             return True
         elif VerbForm_inf in morphology:
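This condition lives in the lemmatizer's base-form check, which short-circuits the suffix rules when the analysed form is already the lemma. A condensed, standalone sketch of the predicate (not spaCy's exact signature):

    def is_base_form_sketch(univ_pos, morphology=None):
        # True when the token's morphology says it is already a lemma.
        morphology = morphology or {}
        if univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
            return True
        # New case from this hunk: finite present-tense verbs (VBP),
        # e.g. 'bleed' in "they bleed", count as base forms too.
        if univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin'
                                   and morphology.get('Tense') == 'pres'):
            return True
        if univ_pos == 'adj' and morphology.get('Degree') == 'pos':
            return True
        return False

    print(is_base_form_sketch('verb', {'VerbForm': 'fin', 'Tense': 'pres'}))  # True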
spacy/pipeline.pyx (filename inferred)

@@ -142,7 +142,7 @@ class BaseThincComponent(object):

         deserialize = OrderedDict((
             ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
-            ('model', lambda b: self.model.from_bytes(b)),
+            ('model', load_model),
             ('vocab', lambda b: self.vocab.from_bytes(b))
         ))
         util.from_bytes(bytes_data, deserialize, exclude)
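The switch from an inline lambda to a load_model callback matters because the OrderedDict entries run in order: 'cfg' is restored first, so the model can be built with its saved hyper-parameters before the weights are loaded. A stripped-down sketch of the pattern (build_model and load_weights are hypothetical stand-ins, not spaCy functions):

    from collections import OrderedDict
    import json

    def from_bytes_sketch(component, msg):
        # msg: section name -> bytes, mimicking the serialized payload.
        def load_model(b):
            if component.model is None:  # model not constructed yet
                width = component.cfg.get('token_vector_width', 128)
                component.model = build_model(width)   # hypothetical builder
            component.model.load_weights(b)            # hypothetical loader

        deserialize = OrderedDict((
            ('cfg', lambda b: component.cfg.update(json.loads(b))),
            ('model', load_model),  # runs after 'cfg' is populated
        ))
        for name, loader in deserialize.items():
            if name in msg:
                loader(msg[name])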
@@ -417,7 +417,8 @@ class NeuralTagger(BaseThincComponent):
     def from_bytes(self, bytes_data, **exclude):
         def load_model(b):
             if self.model is True:
-                token_vector_width = util.env_opt('token_vector_width', 128)
+                token_vector_width = util.env_opt('token_vector_width',
+                                                  self.cfg.get('token_vector_width', 128))
                 self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
             self.model.from_bytes(b)
@@ -451,7 +452,8 @@ class NeuralTagger(BaseThincComponent):
     def from_disk(self, path, **exclude):
         def load_model(p):
             if self.model is True:
-                token_vector_width = util.env_opt('token_vector_width', 128)
+                token_vector_width = util.env_opt('token_vector_width',
+                                                  self.cfg.get('token_vector_width', 128))
                 self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
             self.model.from_bytes(p.open('rb').read())
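Both loaders now fall back to the width recorded in the component's cfg instead of a hard-coded 128, so a tagger saved with a non-default token_vector_width deserializes into a model of the right shape. The resolution order is: environment override, then saved cfg, then default. A rough stand-in for the lookup (env_opt_sketch only approximates spacy.util.env_opt):

    import os

    def env_opt_sketch(name, default):
        # An environment variable overrides the supplied default.
        value = os.environ.get(name.upper())
        return type(default)(value) if value is not None else default

    cfg = {'token_vector_width': 96}  # written at save time
    width = env_opt_sketch('token_vector_width',
                           cfg.get('token_vector_width', 128))
    print(width)  # 96, unless TOKEN_VECTOR_WIDTH is set in the environment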
spacy/syntax/nn_parser.pyx (filename inferred)

@@ -393,7 +393,8 @@ cdef class Parser:

         tokvecs = self.model[0].ops.flatten(tokvecses)
         if USE_FINE_TUNE:
-            tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
+            # TODO: This is incorrect! Unhack when training next model
+            tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))

         nr_state = len(docs)
         nr_class = self.moves.n_moves
@@ -531,8 +532,8 @@ cdef class Parser:
             docs = [docs]
             golds = [golds]
         if USE_FINE_TUNE:
-            tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
-            tokvecs = self.model[0].ops.flatten(tokvecs)
+            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
+            tokvecs += self.model[0].ops.flatten(my_tokvecs)

         cuda_stream = get_cuda_stream()
@@ -605,8 +606,8 @@ cdef class Parser:
         assert min(lengths) >= 1
         tokvecs = self.model[0].ops.flatten(tokvecs)
         if USE_FINE_TUNE:
-            tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
-            tokvecs = self.model[0].ops.flatten(tokvecs)
+            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
+            tokvecs += self.model[0].ops.flatten(my_tokvecs)

         states = self.moves.init_batch(docs)
         for gold in golds:
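All three USE_FINE_TUNE hunks make the same switch: the fine-tuned vectors are now added to the pretrained token vectors instead of replacing them, a residual-style mix (the TODO in the first hunk flags that models trained before this change expect the old behaviour). A toy illustration of the difference, with made-up shapes:

    import numpy as np

    tokvecs = np.random.rand(5, 64)     # pretrained token vectors
    fine_tuned = np.random.rand(5, 64)  # output of the fine-tuning layer

    replaced = fine_tuned               # before: original signal discarded
    mixed = tokvecs + fine_tuned        # after: a correction on top of it

The rename to my_tokvecs in the training paths keeps the original tokvecs bound, so the in-place += still has the pretrained values to add to.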
@@ -705,7 +706,7 @@ cdef class Parser:
                                     lower, stream, drop=dropout)
         return state2vec, upper

-    nr_feature = 8
+    nr_feature = 13

     def get_token_ids(self, states):
         cdef StateClass state
spacy/tests/conftest.py (filename inferred)

@@ -13,7 +13,7 @@ from .. import util

 _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
               'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
-_models = {'en': ['en_depent_web_sm', 'en_core_web_md'],
+_models = {'en': ['en_core_web_sm'],
            'de': ['de_core_news_md'],
            'fr': ['fr_depvec_web_lg'],
            'xx': ['xx_ent_web_md']}
spacy/tests/lang/en/test_lemmatizer.py (filename inferred from the relative import depth)

@@ -2,12 +2,18 @@
 from __future__ import unicode_literals

 import pytest
+from ....tokens.doc import Doc


 @pytest.fixture
 def en_lemmatizer(EN):
     return EN.Defaults.create_lemmatizer()

+@pytest.mark.models('en')
+def test_doc_lemmatization(EN):
+    doc = Doc(EN.vocab, words=['bleed'])
+    doc[0].tag_ = 'VBP'
+    assert doc[0].lemma_ == 'bleed'

 @pytest.mark.models('en')
 @pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
@@ -19,6 +25,16 @@ def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
     assert en_lemmatizer.noun(text) == set(lemmas)


+@pytest.mark.models('en')
+@pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
+                                         ("feed", ["feed"]),
+                                         ("need", ["need"]),
+                                         ("ring", ["ring"]),
+                                         ("axes", ["axis", "axe", "ax"])])
+def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
+    assert en_lemmatizer.noun(text) == set(lemmas)
+
+
 @pytest.mark.xfail
 @pytest.mark.models('en')
 def test_en_lemmatizer_base_forms(en_lemmatizer):
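The Doc-based test added in the previous hunk exercises the lemmatizer change above: VBP carries VerbForm=fin and Tense=pres in the English tag map, so 'bleed' now lemmatizes to itself instead of going through the suffix rules. The parametrized test pins down noun lemmas for forms that look inflected ('bleed', 'feed', 'ring') but should come back unchanged. Note that it reuses the name test_en_lemmatizer_noun_lemmas, so at module scope it shadows the earlier test of the same name.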
@@ -25,7 +25,6 @@ def test_tag_names(EN):
     doc = EN(text, disable=['parser'])
     assert type(doc[2].pos) == int
     assert isinstance(doc[2].pos_, six.text_type)
-    assert type(doc[2].dep) == int
     assert isinstance(doc[2].dep_, six.text_type)
     assert doc[2].tag_ == u'NNS'
travis.sh (filename inferred)

@@ -2,9 +2,8 @@

 if [ "${VIA}" == "pypi" ]; then
     rm -rf *
-    pip install spacy
-    python -m spacy.en.download
-    python -m spacy.de.download
+    pip install spacy-nightly
+    python -m spacy download en
 fi

 if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then