# spaCy/spacy/tests/regression/test_issue999.py

from __future__ import unicode_literals
import os
import random
import contextlib
import shutil
import pytest
import tempfile
from pathlib import Path


import pathlib
from ...gold import GoldParse
from ...pipeline import EntityRecognizer
from ...language import Language

# Python 2/3 compatibility shim: evaluating the bare name raises NameError
# on interpreters that have no ``unicode`` builtin (Python 3), in which case
# the name is bound to ``str`` so it exists at module level either way.
try:
    unicode
except NameError:
    unicode = str


@pytest.fixture
def train_data():
    """Provide the toy NER corpus as a list of ``[text, entity_offsets]`` pairs.

    ``entity_offsets`` is a list of ``[start_char, end_char, label]`` entries
    and is empty for texts without entities. A new list (with new inner
    lists) is built on every call.
    """
    # Utterances that carry no entity annotations.
    plain_texts = [
        "hey",
        "howdy",
        "hey there",
        "hello",
        "hi",
        "i'm looking for a place to eat",
    ]
    # Utterances with exactly one annotated span each; note the second
    # cuisine example is spelled "chines".
    annotated = [
        ["i'm looking for a place in the north of town", [[31, 36, "location"]]],
        ["show me chinese restaurants", [[8, 15, "cuisine"]]],
        ["show me chines restaurants", [[8, 14, "cuisine"]]],
    ]
    return [[text, []] for text in plain_texts] + annotated


@contextlib.contextmanager
def temp_save_model(model):
    """Save ``model`` into a fresh temporary directory for the ``with`` body.

    model: Any object exposing ``save_to_directory(path)``; it is called with
        the temporary directory as a ``Path``.
    YIELDS (Path): The temporary directory the model was saved into.

    The directory is deleted when the ``with`` block exits, including when
    the body (or the save itself) raises, so failures do not leak temp
    directories.
    """
    model_dir = Path(tempfile.mkdtemp())
    try:
        model.save_to_directory(model_dir)
        yield model_dir
    finally:
        # Runs on normal exit and on exceptions thrown into the generator.
        shutil.rmtree(model_dir.as_posix())


# TODO: Fix when saving/loading is fixed.
@pytest.mark.xfail
def test_issue999(train_data):
    '''Check that an entity recognizer trained on new labels still predicts
    them after the pipeline is saved and loaded again.

    Two known problems are exercised here:

    1) Labels have to be re-added by hand, which isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.
    '''
    nlp = Language(pipeline=[])
    nlp.entity = EntityRecognizer(
        nlp.vocab, features=Language.Defaults.entity_features
    )
    nlp.pipeline.append(nlp.entity)

    # Register the label of every gold span before training starts.
    for _text, gold_spans in train_data:
        for _start, _end, span_label in gold_spans:
            nlp.entity.add_label(span_label)
    nlp.entity.model.learn_rate = 0.001

    # 100 passes over the data, reshuffled in place before each pass.
    for _epoch in range(100):
        random.shuffle(train_data)
        for text, gold_spans in train_data:
            doc = nlp.make_doc(text)
            nlp.entity.update(doc, GoldParse(doc, entities=gold_spans))

    # Only the load happens inside the context; the temporary directory is
    # already gone by the time nlp2 is used below.
    with temp_save_model(nlp) as model_dir:
        nlp2 = Language(path=model_dir)

    for text, gold_spans in train_data:
        doc = nlp2(text)
        predicted = {}
        for ent in doc.ents:
            predicted[(ent.start_char, ent.end_char)] = ent.label_
        # First gold span whose exact character range was also predicted.
        hit = next(
            ((start, end, label) for start, end, label in gold_spans
             if (start, end) in predicted),
            None,
        )
        if hit is not None:
            start, end, label = hit
            assert predicted[(start, end)] == label
        elif gold_spans:
            # Gold entities exist but none of their spans was predicted.
            raise Exception(predicted)
Add test for Issue #999 2017-04-23 18:06:30 +03:00			`from __future__ import unicode_literals`
			`import os`
			`import random`
			`import contextlib`
			`import shutil`
			`import pytest`
			`import tempfile`
			`from pathlib import Path`


			`import pathlib`
			`from ...gold import GoldParse`
			`from ...pipeline import EntityRecognizer`
Try to make test999 less flakey 2017-04-26 19:42:06 +03:00			`from ...language import Language`
Add test for Issue #999 2017-04-23 18:06:30 +03:00
			`try:`
			`unicode`
			`except NameError:`
			`unicode = str`


			`@pytest.fixture`
			`def train_data():`
			`return [`
			`["hey",[]],`
			`["howdy",[]],`
			`["hey there",[]],`
			`["hello",[]],`
			`["hi",[]],`
			`["i'm looking for a place to eat",[]],`
			`["i'm looking for a place in the north of town",[[31,36,"location"]]],`
			`["show me chinese restaurants",[[8,15,"cuisine"]]],`
			`["show me chines restaurants",[[8,14,"cuisine"]]],`
			`]`


			`@contextlib.contextmanager`
			`def temp_save_model(model):`
			`model_dir = Path(tempfile.mkdtemp())`
			`model.save_to_directory(model_dir)`
			`yield model_dir`
			`shutil.rmtree(model_dir.as_posix())`


Redesign training to integrate NN components * Obsolete .parser, .entity etc names in favour of .pipeline * Components no longer create models on initialization * Models created by loading method (from_disk(), from_bytes() etc), or .begin_training() * Add .predict(), .set_annotations() methods in components * Pass state through pipeline, to allow components to share information more flexibly. 2017-05-16 17:17:30 +03:00			`# TODO: Fix when saving/loading is fixed.`
			`@pytest.mark.xfail`
Add test for Issue #999 2017-04-23 18:06:30 +03:00			`def test_issue999(train_data):`
			`'''Test that adding entities and resuming training works passably OK.`
			`There are two issues here:`

			`1) We have to readd labels. This isn't very nice.`
			`2) There's no way to set the learning rate for the weight update, so we`
			`end up out-of-scale, causing it to learn too fast.`
			`'''`
Redesign training to integrate NN components * Obsolete .parser, .entity etc names in favour of .pipeline * Components no longer create models on initialization * Models created by loading method (from_disk(), from_bytes() etc), or .begin_training() * Add .predict(), .set_annotations() methods in components * Pass state through pipeline, to allow components to share information more flexibly. 2017-05-16 17:17:30 +03:00			`nlp = Language(pipeline=[])`
Try to make test999 less flakey 2017-04-26 19:42:06 +03:00			`nlp.entity = EntityRecognizer(nlp.vocab, features=Language.Defaults.entity_features)`
Redesign training to integrate NN components * Obsolete .parser, .entity etc names in favour of .pipeline * Components no longer create models on initialization * Models created by loading method (from_disk(), from_bytes() etc), or .begin_training() * Add .predict(), .set_annotations() methods in components * Pass state through pipeline, to allow components to share information more flexibly. 2017-05-16 17:17:30 +03:00			`nlp.pipeline.append(nlp.entity)`
Add test for Issue #999 2017-04-23 18:06:30 +03:00			`for _, offsets in train_data:`
			`for start, end, ent_type in offsets:`
			`nlp.entity.add_label(ent_type)`
Fix test 2017-04-24 01:25:55 +03:00			`nlp.entity.model.learn_rate = 0.001`
Make training test less flakey 2017-04-23 23:59:34 +03:00			`for itn in range(100):`
Add test for Issue #999 2017-04-23 18:06:30 +03:00			`random.shuffle(train_data)`
			`for raw_text, entity_offsets in train_data:`
			`doc = nlp.make_doc(raw_text)`
			`gold = GoldParse(doc, entities=entity_offsets)`
			`loss = nlp.entity.update(doc, gold)`

			`with temp_save_model(nlp) as model_dir:`
Try to make test999 less flakey 2017-04-26 19:42:06 +03:00			`nlp2 = Language(path=model_dir)`
Add test for Issue #999 2017-04-23 18:06:30 +03:00
			`for raw_text, entity_offsets in train_data:`
			`doc = nlp2(raw_text)`
			`ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}`
			`for start, end, label in entity_offsets:`
Fix flakey test 2017-04-24 01:06:30 +03:00			`if (start, end) in ents:`
			`assert ents[(start, end)] == label`
			`break`
			`else:`
Fix test 2017-04-24 01:25:55 +03:00			`if entity_offsets:`
			`raise Exception(ents)`