# spaCy/spacy/tests/regression/test_issue910.py

from __future__ import unicode_literals
import json
import random
import contextlib
import shutil
import pytest
import tempfile
from pathlib import Path


from ...gold import GoldParse
from ...pipeline import EntityRecognizer
from ...lang.en import English

# Python 2/3 compatibility shim: probing the name `unicode` raises NameError
# where no such builtin exists (Python 3), in which case it is aliased to
# `str`. Where the builtin exists, it is left untouched.
try:
    unicode
except NameError:
    unicode = str


@pytest.fixture
def train_data():
    '''Provide the toy restaurant-search corpus used to fine-tune the NER.

    RETURNS (list): One `[text, spans]` pair per utterance, where `spans` is
        a list of `[start_char, end_char, label]` triples (empty when the
        utterance carries no entity). Fresh lists are built on every call.
    '''
    # Immutable table of (utterance, entity spans); converted to the nested
    # list layout on return.
    examples = (
        ("hey", ()),
        ("howdy", ()),
        ("hey there", ()),
        ("hello", ()),
        ("hi", ()),
        ("i'm looking for a place to eat", ()),
        ("i'm looking for a place in the north of town",
         ((31, 36, "location"),)),
        ("show me chinese restaurants", ((8, 15, "cuisine"),)),
        ("show me chines restaurants", ((8, 14, "cuisine"),)),
        ("yes", ()),
        ("yep", ()),
        ("yeah", ()),
        ("show me a mexican place in the centre",
         ((31, 37, "location"), (10, 17, "cuisine"))),
        ("bye", ()),
        ("goodbye", ()),
        ("good bye", ()),
        ("stop", ()),
        ("end", ()),
        ("i am looking for an indian spot", ((20, 26, "cuisine"),)),
        ("search for restaurants", ()),
        ("anywhere in the west", ((16, 20, "location"),)),
        ("central indian restaurant",
         ((0, 7, "location"), (8, 14, "cuisine"))),
        ("indeed", ()),
        ("that's right", ()),
        ("ok", ()),
        ("great", ()),
    )
    return [[text, [list(span) for span in spans]] for text, spans in examples]

@pytest.fixture
def additional_entity_types():
    '''Provide the entity labels that the test adds to the recognizer.

    RETURNS (list): The label names, in the order they are added.
    '''
    new_labels = ('cuisine', 'location')
    return list(new_labels)


@contextlib.contextmanager
def temp_save_model(model):
    '''Save `model` into a fresh temporary directory for the length of a
    `with` block.

    model: Any object exposing a `to_disk(path)` method.
    YIELDS (str): Path of the temporary directory, exactly as returned by
        `tempfile.mkdtemp()`.

    The directory is deleted when the block exits, including when the body
    (or `to_disk` itself) raises.
    '''
    model_dir = tempfile.mkdtemp()
    try:
        model.to_disk(model_dir)
        yield model_dir
    finally:
        # mkdtemp() returns a plain string, so it goes to rmtree directly;
        # strings have no `.as_posix()`.
        shutil.rmtree(model_dir)


@pytest.mark.xfail
@pytest.mark.models('en')
def test_issue910(EN, train_data, additional_entity_types):
    '''Test that adding entities and resuming training works passably OK.
    There are two issues here:

    1) We have to readd labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.

    EN: Pipeline object used as `nlp` (presumably the loaded English model
        fixture -- confirm against the test configuration).
    train_data: List of `[text, [[start, end, label], ...]]` examples.
    additional_entity_types: Labels to add to the recognizer before training.
    '''
    nlp = EN
    # Run the untouched pipeline once before any fine-tuning; the result is
    # not inspected.
    nlp(u"I am looking for a restaurant in Berlin")
    # Fine tune the ner model
    for entity_type in additional_entity_types:
        nlp.entity.add_label(entity_type)

    # NOTE(review): `Adam` is not imported anywhere in the visible part of
    # this file, so this line raises NameError as written -- presumably the
    # optimizer from the project's ML backend; confirm and add the import.
    sgd = Adam(nlp.entity.model[0].ops, 0.001)
    for _ in range(10):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            nlp.tagger(doc)
            nlp.tensorizer(doc)
            gold = GoldParse(doc, entities=entity_offsets)
            nlp.entity.update(doc, gold, sgd=sgd, drop=0.5)

    with temp_save_model(nlp.entity) as model_dir:
        # Load the fine tuned model
        loaded_ner = EntityRecognizer(nlp.vocab)
        loaded_ner.from_disk(model_dir)

    # NOTE(review): training runs `nlp.tensorizer` on each doc, evaluation
    # below does not -- verify whether that difference is intended.
    for raw_text, entity_offsets in train_data:
        doc = nlp.make_doc(raw_text)
        nlp.tagger(doc)
        loaded_ner(doc)
        ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        for start, end, label in entity_offsets:
            # A missing span fails the assertion (with the predicted spans in
            # the message) instead of surfacing as a bare KeyError.
            assert ents.get((start, end)) == label, \
                (raw_text, (start, end, label), ents)
Add test for Issue #910: Resuming entity training 2017-03-24 01:38:57 +03:00			`from __future__ import unicode_literals`
			`import json`
			`import random`
			`import contextlib`
			`import shutil`
			`import pytest`
			`import tempfile`
			`from pathlib import Path`


			`from ...gold import GoldParse`
			`from ...pipeline import EntityRecognizer`
Remove imports in /lang/__init__.py 2017-05-09 00:58:07 +03:00			`from ...lang.en import English`
Add test for Issue #910: Resuming entity training 2017-03-24 01:38:57 +03:00
			`try:`
			`unicode`
			`except NameError:`
			`unicode = str`


			`@pytest.fixture`
			`def train_data():`
			`return [`
			`["hey",[]],`
			`["howdy",[]],`
			`["hey there",[]],`
			`["hello",[]],`
			`["hi",[]],`
			`["i'm looking for a place to eat",[]],`
			`["i'm looking for a place in the north of town",[[31,36,"location"]]],`
			`["show me chinese restaurants",[[8,15,"cuisine"]]],`
			`["show me chines restaurants",[[8,14,"cuisine"]]],`
			`["yes",[]],`
			`["yep",[]],`
			`["yeah",[]],`
			`["show me a mexican place in the centre",[[31,37,"location"], [10,17,"cuisine"]]],`
			`["bye",[]],["goodbye",[]],`
			`["good bye",[]],`
			`["stop",[]],`
			`["end",[]],`
			`["i am looking for an indian spot",[[20,26,"cuisine"]]],`
			`["search for restaurants",[]],`
			`["anywhere in the west",[[16,20,"location"]]],`
			`["central indian restaurant",[[0,7,"location"],[8,14,"cuisine"]]],`
			`["indeed",[]],`
			`["that's right",[]],`
			`["ok",[]],`
			`["great",[]]`
			`]`

			`@pytest.fixture`
			`def additional_entity_types():`
			`return ['cuisine', 'location']`


			`@contextlib.contextmanager`
			`def temp_save_model(model):`
Fix tests 2017-06-05 00:00:44 +03:00			`model_dir = tempfile.mkdtemp()`
			`model.to_disk(model_dir)`
Add test for Issue #910: Resuming entity training 2017-03-24 01:38:57 +03:00			`yield model_dir`
			`shutil.rmtree(model_dir.as_posix())`


Fix tests 2017-06-05 00:00:44 +03:00			`@pytest.mark.xfail`
Update model fixtures and reorganise tests 2017-05-29 23:14:31 +03:00			`@pytest.mark.models('en')`
			`def test_issue910(EN, train_data, additional_entity_types):`
Add test for Issue #910: Resuming entity training 2017-03-24 01:38:57 +03:00			`'''Test that adding entities and resuming training works passably OK.`
			`There are two issues here:`

			`1) We have to readd labels. This isn't very nice.`
			`2) There's no way to set the learning rate for the weight update, so we`
			`end up out-of-scale, causing it to learn too fast.`
			`'''`
Fix typo 2017-06-04 23:36:40 +03:00			`nlp = EN`
			`doc = nlp(u"I am looking for a restaurant in Berlin")`
Add test for Issue #910: Resuming entity training 2017-03-24 01:38:57 +03:00			`ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]`
			`# Fine tune the ner model`
			`for entity_type in additional_entity_types:`
Remove xfail on Test #910 2017-04-23 17:28:55 +03:00			`nlp.entity.add_label(entity_type)`
Add test for Issue #910: Resuming entity training 2017-03-24 01:38:57 +03:00
Fix tests 2017-06-05 00:00:44 +03:00			`sgd = Adam(nlp.entity.model[0].ops, 0.001)`
Remove xfail on Test #910 2017-04-23 17:28:55 +03:00			`for itn in range(10):`
Add test for Issue #910: Resuming entity training 2017-03-24 01:38:57 +03:00			`random.shuffle(train_data)`
			`for raw_text, entity_offsets in train_data:`
			`doc = nlp.make_doc(raw_text)`
			`nlp.tagger(doc)`
Fix tests 2017-06-05 00:00:44 +03:00			`nlp.tensorizer(doc)`
Add test for Issue #910: Resuming entity training 2017-03-24 01:38:57 +03:00			`gold = GoldParse(doc, entities=entity_offsets)`
Fix tests 2017-06-05 00:00:44 +03:00			`loss = nlp.entity.update(doc, gold, sgd=sgd, drop=0.5)`
Add test for Issue #910: Resuming entity training 2017-03-24 01:38:57 +03:00
			`with temp_save_model(nlp.entity) as model_dir:`
			`# Load the fine tuned model`
Fix tests 2017-06-05 00:00:44 +03:00			`loaded_ner = EntityRecognizer(nlp.vocab)`
			`loaded_ner.from_disk(model_dir)`
Add test for Issue #910: Resuming entity training 2017-03-24 01:38:57 +03:00
Remove xfail on Test #910 2017-04-23 17:28:55 +03:00			`for raw_text, entity_offsets in train_data:`
			`doc = nlp.make_doc(raw_text)`
			`nlp.tagger(doc)`
			`loaded_ner(doc)`
			`ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}`
			`for start, end, label in entity_offsets:`
			`if (start, end) not in ents:`
			`print(ents)`
			`assert ents[(start, end)] == label`