Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-10-31 16:07:41 +03:00 · 2017-09-04 20:02:53 -05:00 · 2017-09-04 20:02:53 -05:00 · 1b65115bc2
commit 1b65115bc2
parent 3cf3fa1704 e88a42e460
11 changed files with 49 additions and 23 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,8 +14,7 @@ os:
 env:
  - VIA=compile LC_ALL=en_US.ascii 
  - VIA=compile
-
-#  - VIA=sdist
+  #- VIA=pypi_nightly

 install:
  - "./travis.sh"
@@ -23,7 +22,7 @@ install:
 script:
  - "pip install pytest pytest-timeout"
  - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
-  - if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi
+  - if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
  - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
  
 notifications:
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -212,12 +212,14 @@ class PrecomputableMaxouts(Model):

 def drop_layer(layer, factor=2.):
    def drop_layer_fwd(X, drop=0.):
-        drop *= factor
-        mask = layer.ops.get_dropout_mask((1,), drop)
-        if mask is None or mask > 0:
+        if drop <= 0.:
            return layer.begin_update(X, drop=drop)
        else:
-            return X, lambda dX, sgd=None: dX
+            coinflip = layer.ops.xp.random.random()
+            if (coinflip / factor) >= drop:
+                return layer.begin_update(X, drop=drop)
+            else:
+                return X, lambda dX, sgd=None: dX

    model = wrap(drop_layer_fwd, layer)
    model.predict = layer
@@ -362,6 +364,8 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
    def backward(d_output, sgd=None):
        return (tokens, d_output)
    return vectors, backward
+
+
 def fine_tune(embedding, combine=None):
    if combine is not None:
        raise NotImplementedError(
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy-nightly'
-__version__ = '2.0.0a12'
+__version__ = '2.0.0a13'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
--- a/spacy/lang/en/morph_rules.py
+++ b/spacy/lang/en/morph_rules.py
@@ -59,7 +59,8 @@ MORPH_RULES = {

    "VBP": {
        "are":          {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
-        "'re":          {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
+        "'re":          {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
+        "am":           {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
    },

    "VBD": {
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -44,6 +44,11 @@ class Lemmatizer(object):
            return True
        elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
            return True
+        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
+        # morphology
+        elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
+                                     morphology.get('Tense') == 'pres'):
+            return True
        elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
            return True
        elif VerbForm_inf in morphology:
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -142,7 +142,7 @@ class BaseThincComponent(object):

        deserialize = OrderedDict((
            ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
-            ('model', lambda b: self.model.from_bytes(b)),
+            ('model', load_model),
            ('vocab', lambda b: self.vocab.from_bytes(b))
        ))
        util.from_bytes(bytes_data, deserialize, exclude)
@@ -417,7 +417,8 @@ class NeuralTagger(BaseThincComponent):
    def from_bytes(self, bytes_data, **exclude):
        def load_model(b):
            if self.model is True:
-                token_vector_width = util.env_opt('token_vector_width', 128)
+                token_vector_width = util.env_opt('token_vector_width',
+                        self.cfg.get('token_vector_width', 128))
                self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
            self.model.from_bytes(b)

@@ -451,7 +452,8 @@ class NeuralTagger(BaseThincComponent):
    def from_disk(self, path, **exclude):
        def load_model(p):
            if self.model is True:
-                token_vector_width = util.env_opt('token_vector_width', 128)
+                token_vector_width = util.env_opt('token_vector_width',
+                        self.cfg.get('token_vector_width', 128))
                self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
            self.model.from_bytes(p.open('rb').read())

--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -393,7 +393,8 @@ cdef class Parser:

        tokvecs = self.model[0].ops.flatten(tokvecses)
        if USE_FINE_TUNE:
-            tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
+            # TODO: This is incorrect! Unhack when training next model
+            tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))

        nr_state = len(docs)
        nr_class = self.moves.n_moves
@@ -531,8 +532,8 @@ cdef class Parser:
            docs = [docs]
            golds = [golds]
        if USE_FINE_TUNE:
-            tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
-            tokvecs = self.model[0].ops.flatten(tokvecs)
+            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
+            tokvecs += self.model[0].ops.flatten(my_tokvecs)

        cuda_stream = get_cuda_stream()

@@ -605,8 +606,8 @@ cdef class Parser:
        assert min(lengths) >= 1
        tokvecs = self.model[0].ops.flatten(tokvecs)
        if USE_FINE_TUNE:
-            tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
-            tokvecs = self.model[0].ops.flatten(tokvecs)
+            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
+            tokvecs += self.model[0].ops.flatten(my_tokvecs)

        states = self.moves.init_batch(docs)
        for gold in golds:
@@ -705,7 +706,7 @@ cdef class Parser:
                        lower, stream, drop=dropout)
        return state2vec, upper

-    nr_feature = 8
+    nr_feature = 13

    def get_token_ids(self, states):
        cdef StateClass state
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -13,7 +13,7 @@ from .. import util

 _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
              'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
-_models = {'en': ['en_depent_web_sm', 'en_core_web_md'],
+_models = {'en': ['en_core_web_sm'],
           'de': ['de_core_news_md'],
           'fr': ['fr_depvec_web_lg'],
           'xx': ['xx_ent_web_md']}
--- a/spacy/tests/lang/en/test_lemmatizer.py
+++ b/spacy/tests/lang/en/test_lemmatizer.py
@@ -2,12 +2,18 @@
 from __future__ import unicode_literals

 import pytest
+from ....tokens.doc import Doc


 @pytest.fixture
 def en_lemmatizer(EN):
    return EN.Defaults.create_lemmatizer()

+@pytest.mark.models('en')
+def test_doc_lemmatization(EN):
+    doc = Doc(EN.vocab, words=['bleed'])
+    doc[0].tag_ = 'VBP'
+    assert doc[0].lemma_ == 'bleed'

 @pytest.mark.models('en')
@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
@@ -19,6 +25,16 @@ def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
    assert en_lemmatizer.noun(text) == set(lemmas)


+@pytest.mark.models('en')
+@pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
+                                         ("feed", ["feed"]),
+                                         ("need", ["need"]),
+                                         ("ring", ["ring"]),
+                                         ("axes", ["axis", "axe", "ax"])])
+def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
+    assert en_lemmatizer.noun(text) == set(lemmas)
+
+
 @pytest.mark.xfail
 @pytest.mark.models('en')
 def test_en_lemmatizer_base_forms(en_lemmatizer):
--- a/spacy/tests/lang/en/test_tagger.py
+++ b/spacy/tests/lang/en/test_tagger.py
@@ -25,7 +25,6 @@ def test_tag_names(EN):
    doc = EN(text, disable=['parser'])
    assert type(doc[2].pos) == int
    assert isinstance(doc[2].pos_, six.text_type)
-    assert type(doc[2].dep) == int
    assert isinstance(doc[2].dep_, six.text_type)
    assert doc[2].tag_ == u'NNS'

--- a/travis.sh
+++ b/travis.sh
@@ -2,9 +2,8 @@

 if [ "${VIA}" == "pypi" ]; then
    rm -rf *
-    pip install spacy
-    python -m spacy.en.download
-    python -m spacy.de.download
+    pip install spacy-nightly
+    python -m spacy download en
 fi

 if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then